content/documentation/query/text-query.html - jena-site - Git at Google

 <!DOCTYPE html>
 <html lang="en">
 <head>


     <title>Apache Jena - Jena Full Text Search</title>
     <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">

     <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
     <link href="/css/bootstrap-icons.css" rel="stylesheet" media="screen"><link rel="stylesheet" type="text/css" href="https://jena.apache.org/sass/jena.d6ada6933ea246671d51bf39ceb2afb2f710a45f3793ab87e06f74f3b5d17c3d.css" integrity="sha256-1q2mkz6iRmcdUb85zrKvsvcQpF83k6uH4G9087XRfD0=">
     <link rel="shortcut icon" href="/images/favicon.ico" />

 </head>

 <body>

 <nav class="navbar navbar-expand-lg bg-body-tertiary" role="navigation">
     <div class="container">
         <div class="navbar-header">
             <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
                 <span class="navbar-toggler-icon"></span>
             </button>
             <a class="navbar-brand" href="/index.html">
                 <img class="logo-menu" src="/images/jena-logo/jena-logo-notext-small.png" alt="jena logo">Apache Jena</a>
         </div>

         <div class="collapse navbar-collapse" id="navbarNav">
             <ul class="navbar-nav me-auto mb-2 mb-lg-0">
                 <li id="homepage" class="nav-item"><a class="nav-link" href="/index.html"><span class="bi-house"></span> Home</a></li>
                 <li id="download" class="nav-item"><a class="nav-link" href="/download/index.cgi"><span class="bi-download"></span> Download</a></li>
                 <li class="nav-item dropdown">
                     <a href="#" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-journal"></span> Docs <b class="caret"></b></a>
                     <ul class="dropdown-menu">
                         <li class="dropdown-header">Tutorials</li>
                         <li><a class="dropdown-item" href="/tutorials/index.html">Overview</a></li>
                         <li><a class="dropdown-item"  href="/documentation/fuseki2/index.html">Fuseki Triplestore</a></li>
                         <li><a class="dropdown-item"  href="/documentation/notes/index.html">How-To's</a></li>
                         <li><a class="dropdown-item"  href="/documentation/query/manipulating_sparql_using_arq.html">Manipulating SPARQL using ARQ</a></li>
                         <li><a class="dropdown-item"  href="/tutorials/rdf_api.html">RDF core API tutorial</a></li>
                         <li><a class="dropdown-item"  href="/tutorials/sparql.html">SPARQL tutorial</a></li>
                         <li><a class="dropdown-item"  href="/tutorials/using_jena_with_eclipse.html">Using Jena with Eclipse</a></li>
                         <li class="dropdown-divider"></li>
                         <li class="dropdown-header">References</li>
                         <li><a class="dropdown-item"  href="/documentation/index.html">Overview</a></li>
                         <li><a class="dropdown-item"  href="/documentation/query/index.html">ARQ (SPARQL)</a></li>
                         <li><a class="dropdown-item"  href="/documentation/io/">RDF I/O</a></li>
                         <li><a class="dropdown-item"  href="/documentation/assembler/index.html">Assembler</a></li>
                         <li><a class="dropdown-item"  href="/documentation/tools/index.html">Command-line tools</a></li>
                         <li><a class="dropdown-item"  href="/documentation/rdfs/">Data with RDFS Inferencing</a></li>
                         <li><a class="dropdown-item"  href="/documentation/geosparql/index.html">GeoSPARQL</a></li>
                         <li><a class="dropdown-item"  href="/documentation/inference/index.html">Inference API</a></li>
                         <li><a class="dropdown-item"  href="/documentation/ontology/">Ontology API</a></li>
                         <li><a class="dropdown-item"  href="/documentation/extras/querybuilder/index.html">Query Builder</a></li>
                         <li><a class="dropdown-item"  href="/documentation/rdf/index.html">RDF API</a></li>
                         <li><a class="dropdown-item"  href="/documentation/rdfconnection/">RDF Connection - SPARQL API</a></li>
                         <li><a class="dropdown-item"  href="/documentation/rdf-patch/index.html">RDF Patch</a></li>
                         <li><a class="dropdown-item"  href="/documentation/shacl/index.html">SHACL</a></li>
                         <li><a class="dropdown-item"  href="/documentation/shex/index.html">ShEx</a></li>
                         <li><a class="dropdown-item"  href="/documentation/tdb2/index.html">TDB2</a></li>
                         <li><a class="dropdown-item"  href="/documentation/query/text-query.html">Text Search</a></li>
                     </ul>
                 </li>

                 <li class="nav-item dropdown">
                     <a href="#" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-journal-code"></span> Javadoc <b class="caret"></b></a>
                     <ul class="dropdown-menu">
                         <li><a class="dropdown-item" href="/documentation/javadoc.html">All Javadoc</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/arq/">ARQ</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/jena/">Jena Core</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/ontapi/">Jena OntAPI</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/fuseki2/">Fuseki</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/shacl/">SHACL</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/shex/">ShEx</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/tdb2/">TDB</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/text/">Text Search</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/extras/querybuilder/">Query Builder</a></li>
                         <li><a class="dropdown-item" href="/documentation/javadoc/geosparql/">GeoSPARQL</a></li>
                     </ul>
                 </li>

                 <li class="nav-item dropdown">
                     <a href="#" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-journal-code"></span> Project <b class="caret"></b></a>
                     <ul class="dropdown-menu">
                       <li><a class="dropdown-item" href="https://github.com/apache/jena">Code</a></li>
                       <li><a class="dropdown-item" href="https://github.com/apache/jena/blob/main/CONTRIBUTING.md">Contributing</a></li>
                       <li><a class="dropdown-item" href="/security">Security Policies</a></li>
                       <li><a class="dropdown-item" href="/security/advisories.html">Security Advisories</a></li>
                     </ul>
                 </li>

             </ul>
             <form class="d-flex" role="search" action="/search" method="GET">
                 <div class="input-group">
                     <input class="form-control border-end-0 border m-0" type="search" name="q" id="search-query" placeholder="Search..." aria-label="Search" style="width: 10rem;">
                     <button class="btn btn-outline-secondary border-start-0 border" type="submit">
                         <i class="bi-search"></i>
                     </button>
                 </div>
             </form>
             <ul class="navbar-nav">
                 <li id="ask" class="nav-item"><a class="nav-link" href="/help_and_support/index.html" title="Ask"><span class="bi-patch-question"></span><span class="text-body d-none d-xxl-inline"> Ask</span></a></li>

                 <li class="nav-item dropdown">
                     <a href="#" title="Get involved" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-megaphone"></span><span class="text-body d-none d-xxl-inline"> Get involved </span><b class="caret"></b></a>
                     <ul class="dropdown-menu">
                         <li><a class="dropdown-item" href="/getting_involved/index.html">Contribute</a></li>
                         <li><a class="dropdown-item" href="/help_and_support/bugs_and_suggestions.html">Report a bug</a></li>
                         <li class="dropdown-divider"></li>
                         <li class="dropdown-header">Project</li>
                         <li><a class="dropdown-item" href="/about_jena/about.html">About Jena</a></li>
                         <li><a class="dropdown-item" href="/about_jena/citing.html">Citing</a></li>
                         <li><a class="dropdown-item" href="/about_jena/team.html">Project team</a></li>
                         <li><a class="dropdown-item" href="/security">Security</a></li>
                         <li class="dropdown-divider"></li>
                         <li class="dropdown-header">ASF</li>
                         <li><a class="dropdown-item" href="https://www.apache.org/">Apache Software Foundation</a></li>
                         <li><a class="dropdown-item" href="https://www.apache.org/foundation/sponsorship.html">Become a Sponsor</a></li>
                         <li><a class="dropdown-item" href="https://www.apache.org/licenses/">License</a></li>
                         <li><a class="dropdown-item" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
                         <li><a class="dropdown-item" href="https://www.apache.org/foundation/sponsorship.html">Donate</a></li>
                         <li><a class="dropdown-item" href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy</a></li>
                     </ul>
                 </li>


                 <li class="nav-item" id="edit"><a class="nav-link" href="https://github.com/apache/jena-site/edit/main/source/documentation/query/text-query.md" title="Edit this page on GitHub"><span class="bi-pencil-square"></span><span class="text-body d-none d-xxl-inline"> Edit this page</span></a></li>
             </ul>
         </div>
     </div>
 </nav>

 <div class="container">
     <div class="row">
         <div class="col-md-12">

             <div id="breadcrumbs">


 <ol class="breadcrumb mt-4 p-2 bg-body-tertiary">


                 <li class="breadcrumb-item"><a href='/documentation'>DOCUMENTATION</a></li>


                 <li class="breadcrumb-item"><a href='/documentation/query'>QUERY</a></li>


                 <li class="breadcrumb-item active">TEXT QUERY</li>


 </ol>


             </div>
             <h1 class="title">Jena Full Text Search</h1>


 <main class="d-flex flex-xl-row flex-column">

   <aside class="text-muted align-self-start mb-3 p-0 d-xl-none d-block">
     <h2 class="h6 sticky-top m-0 p-2 bg-body-tertiary">On this page</h2>
     <nav id="TableOfContents">
   <ul>
     <li><a href="#architecture">Architecture</a>
       <ul>
         <li><a href="#one-triple-equals-one-document">One triple equals one document</a></li>
         <li><a href="#one-document-equals-one-entity">One document equals one entity</a>
           <ul>
             <li><a href="#external-content">External content</a></li>
           </ul>
         </li>
         <li><a href="#external-applications">External applications</a></li>
         <li><a href="#document-structure">Document structure</a></li>
       </ul>
     </li>
     <li><a href="#query-with-sparql">Query with SPARQL</a>
       <ul>
         <li><a href="#syntax">Syntax</a>
           <ul>
             <li><a href="#input-arguments">Input arguments:</a></li>
             <li><a href="#output-arguments">Output arguments:</a></li>
           </ul>
         </li>
         <li><a href="#query-strings">Query strings</a>
           <ul>
             <li><a href="#simple-queries">Simple queries</a></li>
             <li><a href="#queries-with-language-tags">Queries with language tags</a></li>
             <li><a href="#queries-that-retrieve-literals">Queries that retrieve literals</a></li>
             <li><a href="#queries-with-graphs">Queries with graphs</a></li>
             <li><a href="#queries-across-multiple-fields">Queries across multiple <code>Field</code>s</a></li>
             <li><a href="#queries-with-boolean-operators-and-term-modifiers">Queries with <em>Boolean Operators</em> and <em>Term Modifiers</em></a></li>
             <li><a href="#highlighting">Highlighting</a></li>
           </ul>
         </li>
         <li><a href="#good-practice">Good practice</a>
           <ul>
             <li><a href="#query-pattern-1--find-in-the-text-index-and-refine-results">Query pattern 1 – Find in the text index and refine results</a></li>
             <li><a href="#query-pattern-2--filter-results-via-the-text-index">Query pattern 2 – Filter results via the text index</a></li>
           </ul>
         </li>
       </ul>
     </li>
     <li><a href="#configuration">Configuration</a>
       <ul>
         <li><a href="#text-dataset-assembler">Text Dataset Assembler</a>
           <ul>
             <li><a href="#lists-of-indexed-properties">Lists of Indexed Properties</a></li>
           </ul>
         </li>
         <li><a href="#entity-map-definition">Entity Map definition</a>
           <ul>
             <li><a href="#default-text-field">Default text field</a></li>
             <li><a href="#entity-field">Entity field</a></li>
             <li><a href="#uid-field-and-automatic-document-deletion">UID Field and automatic document deletion</a></li>
             <li><a href="#language-field">Language Field</a></li>
             <li><a href="#graph-field">Graph Field</a></li>
             <li><a href="#the-analyzer-map">The Analyzer Map</a></li>
           </ul>
         </li>
         <li><a href="#configuring-an-analyzer">Configuring an Analyzer</a>
           <ul>
             <li><a href="#configurableanalyzer">ConfigurableAnalyzer</a></li>
             <li><a href="#analyzer-for-query">Analyzer for Query</a></li>
             <li><a href="#alternative-query-parsers">Alternative Query Parsers</a></li>
           </ul>
         </li>
         <li><a href="#configuration-by-code">Configuration by Code</a></li>
         <li><a href="#graph-specific-indexing">Graph-specific Indexing</a></li>
         <li><a href="#linguistic-support-with-lucene-index">Linguistic support with Lucene index</a>
           <ul>
             <li><a href="#explicit-language-field-in-the-index">Explicit Language Field in the Index</a></li>
             <li><a href="#sparql-linguistic-clause-forms">SPARQL Linguistic Clause Forms</a></li>
             <li><a href="#localizedanalyzer">LocalizedAnalyzer</a></li>
             <li><a href="#multilingual-support">Multilingual Support</a></li>
           </ul>
         </li>
         <li><a href="#generic-and-defined-analyzer-support">Generic and Defined Analyzer Support</a>
           <ul>
             <li><a href="#generic-analyzers-tokenizers-and-filters">Generic Analyzers, Tokenizers and Filters</a></li>
             <li><a href="#defined-analyzers">Defined Analyzers</a></li>
             <li><a href="#extending-multilingual-support">Extending multilingual support</a></li>
             <li><a href="#multilingual-enhancements-for-multi-encoding-searches">Multilingual enhancements for multi-encoding searches</a></li>
             <li><a href="#naming-analyzers-for-later-use">Naming analyzers for later use</a></li>
           </ul>
         </li>
         <li><a href="#storing-literal-values">Storing Literal Values</a></li>
       </ul>
     </li>
     <li><a href="#working-with-fuseki">Working with Fuseki</a></li>
     <li><a href="#building-a-text-index">Building a Text Index</a>
       <ul>
         <li><a href="#step-1---building-a-tdb-dataset">Step 1 - Building a TDB dataset</a></li>
         <li><a href="#step-2---build-the-text-index">Step 2 - Build the Text Index</a>
           <ul>
             <li><a href="#updating-the-index">Updating the index</a></li>
           </ul>
         </li>
       </ul>
     </li>
   </ul>

   <ul>
     <li><a href="#default-behavior">Default Behavior</a>
       <ul>
         <li><a href="#example">Example</a></li>
       </ul>
     </li>
     <li><a href="#multiple-fields-per-document">Multiple fields per document</a></li>
     <li><a href="#maven-dependency">Maven Dependency</a></li>
   </ul>
 </nav>
   </aside>
   <article class="flex-column me-lg-4">
     <p>This extension to ARQ combines SPARQL and full text search via
 <a href="https://lucene.apache.org">Lucene</a>.
 It gives applications the ability to perform indexed full text
 searches within SPARQL queries. Here is a version compatibility table:</p>
 <table>
   <thead>
       <tr>
           <th> Jena </th>
           <th> Lucene </th>
           <th> Solr </th>
           <th> ElasticSearch </th>
       </tr>
   </thead>
   <tbody>
       <tr>
           <td>up to 3.2.0</td>
           <td>5.x or 6.x</td>
           <td>5.x or 6.x</td>
           <td>not supported</td>
       </tr>
       <tr>
           <td>3.3.0 - 3.9.0</td>
           <td>6.4.x</td>
           <td>not supported</td>
           <td>5.2.2 - 5.2.13</td>
       </tr>
       <tr>
           <td>3.10.0</td>
           <td>7.4.0</td>
           <td>not supported</td>
           <td>6.4.2</td>
       </tr>
       <tr>
           <td>3.15.0 - 3.17.0</td>
           <td>7.7.x</td>
           <td>not supported</td>
           <td>6.8.6</td>
       </tr>
       <tr>
           <td>4.0.0 - 4.6.1</td>
           <td>8.8.x</td>
           <td>not supported</td>
           <td>not supported</td>
       </tr>
       <tr>
           <td>4.7.0 - current</td>
           <td>9.4.x</td>
           <td>not supported</td>
           <td>not supported</td>
       </tr>
   </tbody>
 </table>
 <p>Note: In Lucene 9, the default setup of the <code>StandardAnalyzer</code> changed to having
 no stop words. For more details, see <a href="#configuring-an-analyzer">analyzer specifications</a> below.</p>
 <p>SPARQL allows the use of
 <a href="https://www.w3.org/TR/2013/REC-sparql11-query-20130321/#func-regex">regex</a>
 in <code>FILTER</code>s which is a test on a value retrieved earlier in the query
 so its use <em>is not indexed</em>. For example, if you&rsquo;re
 searching for occurrences of <code>&quot;printer&quot;</code> in the <code>rdfs:label</code> of a bunch
 of products:</p>
 <div class="highlight"><pre tabindex="0" style="background-color:#f8f8f8;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sparql" data-lang="sparql"><span style="display:flex;"><span><span style="color:#a2f;font-weight:bold">PREFIX</span>   <span style="color:#00f;font-weight:bold">ex</span>: <span style="color:#a0a000">&lt;http://www.example.org/resources#&gt;</span>
 </span></span><span style="display:flex;"><span><span style="color:#a2f;font-weight:bold">PREFIX</span> <span style="color:#00f;font-weight:bold">rdfs</span>: <span style="color:#a0a000">&lt;http://www.w3.org/2000/01/rdf-schema#&gt;</span>
 </span></span><span style="display:flex;"><span>
 </span></span><span style="display:flex;"><span><span style="color:#a2f;font-weight:bold">SELECT</span> <span style="color:#b8860b">?s</span> <span style="color:#b8860b">?lbl</span>
 </span></span><span style="display:flex;"><span><span style="color:#a2f;font-weight:bold">WHERE</span> {
 </span></span><span style="display:flex;"><span>  <span style="color:#b8860b">?s</span> <span style="color:#a2f;font-weight:bold">a</span> <span style="color:#00f;font-weight:bold">ex</span>:<span style="color:#008000;font-weight:bold">Product</span> ;
 </span></span><span style="display:flex;"><span>     <span style="color:#00f;font-weight:bold">rdfs</span>:<span style="color:#008000;font-weight:bold">label</span> <span style="color:#b8860b">?lbl</span>
 </span></span><span style="display:flex;"><span>  <span style="color:#a2f;font-weight:bold">FILTER</span> <span style="color:#00a000">regex</span>(<span style="color:#b8860b">?lbl</span>, <span style="color:#b44">&#34;printer&#34;</span>, <span style="color:#b44">&#34;i&#34;</span>)
 </span></span><span style="display:flex;"><span>}
 </span></span></code></pre></div><p>then the search will need to examine <em>all</em> selected <code>rdfs:label</code>
 statements and apply the regular expression to each label in turn. If
 there are many such statements and many such uses of <code>regex</code>, then it
 may be appropriate to consider using this extension to take advantage of
 the performance potential of full text indexing.</p>
 <p>Text indexes provide additional information for accessing the RDF graph
 by allowing the application to have <em>indexed access</em> to the internal
 structure of string literals rather than treating such literals as
 opaque items.  Unlike <code>FILTER</code>, an index can set the values of variables.
 Assuming appropriate <a href="#configuration">configuration</a>, the
 above query can use full text search via the
 <a href="/documentation/query/extension.html#property-functions">ARQ property function extension</a>, <code>text:query</code>:</p>
 <pre><code>PREFIX   ex: &lt;http://www.example.org/resources#&gt;
 PREFIX rdfs: &lt;http://www.w3.org/2000/01/rdf-schema#&gt;
 PREFIX text: &lt;http://jena.apache.org/text#&gt;

 SELECT ?s ?lbl
 WHERE {
 	?s a ex:Product ;
 	   text:query (rdfs:label 'printer') ;
 	   rdfs:label ?lbl
 }
 </code></pre>
 <p>This query makes a text query for <code>'printer'</code> on the <code>rdfs:label</code>
 property; and then looks in the RDF data and retrieves the complete
 label for each match.</p>
 <p>The full text engine can be either <a href="https://lucene.apache.org/core">Apache
 Lucene</a> hosted with Jena on a single
 machine, or <a href="https://www.elastic.co/">Elasticsearch</a> for a large scale
 enterprise search application where the full text engine is potentially
 distributed across separate machines.</p>
 <p>This <a href="https://github.com/apache/jena/tree/main/jena-text/src/main/java/examples/">example code</a>
 illustrates creating an in-memory dataset with a Lucene index.</p>
 <h2 id="architecture">Architecture</h2>
 <p>In general, a text index engine (Lucene or Elasticsearch) indexes
 <em>documents</em> where each document is a collection of <em>fields</em>, the values
 of which are indexed so that searches matching contents of specified
 fields can return a reference to the document containing the fields with
 matching values.</p>
 <p>There are two models for extending Jena with text indexing and search:</p>
 <ul>
 <li>One Jena <em>triple</em> equals one Lucene <em>document</em></li>
 <li>One Lucene <em>document</em> equals one Jena <em>entity</em></li>
 </ul>
 <h3 id="one-triple-equals-one-document">One triple equals one document</h3>
 <p>The basic Jena text extension associates a triple with
 a document and the <em>property</em> of the triple with a <em>field</em> of a document
 and the <em>object</em> of the triple (which must be a literal) with the value
 of the field in the document. The <em>subject</em> of the triple then becomes
 another field of the document that is returned as the result of a search
 match to identify what was matched. (NB: the particular triple that
 matched is not identified, only its subject and, <em>optionally</em>, the matching
 literal and match score.)</p>
 <p>In this manner, the text index provides an inverted index that maps
 query string matches to subject URIs.</p>
 <p>A text-indexed dataset is configured with a description of which
 properties are to be indexed. When triples are added, any properties
 matching the description cause a document to be added to the index by
 analyzing the literal value of the triple object and mapping to the
 subject URI. On the other hand, it is necessary to specifically
 configure the text-indexed dataset to <a href="#entity-map-definition">delete index
 entries</a> when the corresponding triples are
 dropped from the RDF store.</p>
 <p>The text index uses the native query language of the index:
 <a href="http://lucene.apache.org/core/6_4_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package_description">Lucene query language</a>
 (with <a href="#input-arguments">restrictions</a>)
 or
 <a href="https://www.elastic.co/guide/en/elasticsearch/reference/5.2/query-dsl.html">Elasticsearch query language</a>.</p>
 <h3 id="one-document-equals-one-entity">One document equals one entity</h3>
 <p>There are two approaches to creating indexed documents that contain more
 than one indexed field:</p>
 <ul>
 <li>Using an externally maintained Lucene index</li>
 <li><a href="#multiple-fields-per-document">Multiple fields per document</a></li>
 </ul>
 <p>When using this integration model, <code>text:query</code> returns the <em>subject</em> URI
 for the document, with which additional triples of metadata may be associated,
 and optionally the Lucene score for the match.</p>
 <h4 id="external-content">External content</h4>
 <p>When document content is externally indexed via Lucene and accessed in Jena
 via a <code>text:TextDataset</code> then the subject URI returned for a search result
 is considered to refer to the external content, and metadata about the
 document is represented as triples in Jena with the subject URI.</p>
 <p>There is no requirement that the indexed document content be present
 in the RDF data.  As long as the index contains text documents that
 match the index description, text search can be performed with queries that explicitly mention indexed fields in the document.</p>
 <p>That is, if the content of a collection of documents is externally indexed
 and the URI naming the document is the result of the text search, then an RDF
 dataset with the document metadata can be combined with accessing the
 content by URI.</p>
 <p>The maintenance of the index is external to the RDF data store.</p>
 <h3 id="external-applications">External applications</h3>
 <p>By using Elasticsearch, other applications can share the text index with
 SPARQL search.</p>
 <h3 id="document-structure">Document structure</h3>
 <p>As mentioned above, when using the (<em>default</em>) one-triple equals one-document model,
 text indexing of a triple involves associating a Lucene document with the triple.
 How is this done?</p>
 <p>Lucene documents are composed of <code>Field</code>s. Indexing and searching are performed
 over the contents of these <code>Field</code>s. For an RDF triple to be indexed in Lucene the
 <em>property</em> of the triple must be
 <a href="#entity-map-definition">configured in the entity map of a TextIndex</a>.
 This associates a Lucene analyzer with the <em><code>property</code></em> which will be used
 for indexing and search. The <em><code>property</code></em> becomes the <em>searchable</em> Lucene
 <code>Field</code> in the resulting document.</p>
 <p>A Lucene index includes a <em>default</em> <code>Field</code>, which is specified in the configuration,
 that is the field to search if not otherwise named in the query. In jena-text
 this field is configured via the <code>text:defaultField</code> property which is then mapped
 to a specific RDF property via <code>text:predicate</code> (see <a href="#entity-map-definition">entity map</a>
 below).</p>
 <p>There are several additional <code>Field</code>s that will be included in the
 document that is passed to the Lucene <code>IndexWriter</code> depending on the
 configuration options that are used. These additional fields are used to
 manage the interface between Jena and Lucene and are not generally
 searchable per se.</p>
 <p>The most important of these additional <code>Field</code>s is the <code>text:entityField</code>.
 This configuration property defines the name of the <code>Field</code> that will contain
 the <em>URI</em> or <em>blank node id</em> of the <em>subject</em> of the triple being indexed. This property does
 not have a default and must be specified for most uses of <code>jena-text</code>. This
 <code>Field</code> is often given the name, <code>uri</code>, in examples. It is via this <code>Field</code>
 that <code>?s</code> is bound in a typical use such as:</p>
 <pre><code>select ?s
 where {
     ?s text:query &quot;some text&quot;
 }
 </code></pre>
 <p>Other <code>Field</code>s that may be configured, such as <code>text:uidField</code> and <code>text:graphField</code>,
 are discussed below.</p>
 <p>Given the triple:</p>
 <pre><code>ex:SomeOne skos:prefLabel &quot;zorn protégé a prés&quot;@fr .
 </code></pre>
 <p>The following is an abbreviated illustration of a Lucene document that Jena will create and
 request Lucene to index:</p>
 <pre><code>Document&lt;
     &lt;uri:http://example.org/SomeOne&gt;
     &lt;graph:urn:x-arq:DefaultGraphNode&gt;
     &lt;label:zorn protégé a prés&gt;
     &lt;lang:fr&gt;
     &lt;uid:28959d0130121b51e1459a95bdac2e04f96efa2e6518ff3c090dfa7a1e6dcf00&gt;
     &gt;
 </code></pre>
 <p>It may be instructive to refer back to this example when considering the various
 points below.</p>
 <h2 id="query-with-sparql">Query with SPARQL</h2>
 <p>The URI of the text extension property function is
 <code>http://jena.apache.org/text#query</code>, more conveniently written:</p>
 <pre><code>PREFIX text: &lt;http://jena.apache.org/text#&gt;

 ...   text:query ...
 </code></pre>
 <h3 id="syntax">Syntax</h3>
 <p>The following forms are all legal:</p>
 <pre><code>?s text:query 'word'                              # query
 ?s text:query ('word' 10)                         # with limit on results
 ?s text:query (rdfs:label 'word')                 # query specific property if multiple
 ?s text:query (rdfs:label 'protégé' 'lang:fr')    # restrict search to French
 (?s ?score) text:query 'word'                     # query capturing also the score
 (?s ?score ?literal) text:query 'word'            # ... and original literal value
 (?s ?score ?literal ?g) text:query 'word'         # ... and the graph
 </code></pre>
 <p>The most general form when using the <em>default</em> <a href="#one-triple-equals-one-document">one-triple equals one-document</a>
 integration model is:</p>
 <pre><code> ( ?s ?score ?literal ?g ) text:query ( property* 'query string' limit 'lang:xx' 'highlight:yy' )
 </code></pre>
 <p>while for the <a href="#one-document-equals-one-entity">one-document equals one-entity model</a>, the general form is:</p>
 <pre><code> ( ?s ?score ) text:query ( 'query string' limit )
 </code></pre>
 <p>and if only the <em>subject</em> URI is needed:</p>
 <pre><code> ?s text:query ( 'query string' limit )
 </code></pre>
 <h4 id="input-arguments">Input arguments:</h4>
 <table>
   <thead>
       <tr>
           <th> Argument </th>
           <th>  Definition </th>
       </tr>
   </thead>
   <tbody>
       <tr>
           <td>property</td>
           <td>(zero or more) property URIs (including prefix name form)</td>
       </tr>
       <tr>
           <td>query string</td>
           <td>Lucene query string fragment</td>
       </tr>
       <tr>
           <td>limit</td>
           <td>(optional) <code>int</code> limit on the number of results</td>
       </tr>
       <tr>
           <td>lang:xx</td>
           <td>(optional) language tag spec</td>
       </tr>
       <tr>
           <td>highlight:yy</td>
           <td>(optional) highlighting options</td>
       </tr>
   </tbody>
 </table>
 <p>The <code>property</code> URI is only necessary if multiple properties have been
 indexed and the property being searched over is not the <a href="#entity-map-definition">default field
 of the index</a>.</p>
 <p>Since 3.13.0, <code>property</code> may be a list of zero or more (prior to 3.13.0 zero or one) Lucene indexed properties, or a defined
 <a href="#lists-of-indexed-properties"><code>text:propList</code> of indexed properties</a>.
 The meaning is an <code>OR</code> of searches on a variety of properties. This can be used in place of SPARQL level <code>UNION</code>s of
 individual <code>text:query</code>s. For example, instead of:</p>
 <pre><code>select ?foo where {
   {
     (?s ?sc ?lit) text:query ( rdfs:label &quot;some query&quot; ).
   }
   union
   {
     (?s ?sc ?lit) text:query ( skos:altLabel &quot;some query&quot; ).
   }
   union
   {
     (?s ?sc ?lit) text:query ( skos:prefLabel &quot;some query&quot; ).
   }
 }
 </code></pre>
 <p>it can be more performant to push the unions into the Lucene query by rewriting as:</p>
 <pre><code>(?s ?sc ?lit) text:query ( rdfs:label skos:prefLabel skos:altLabel &quot;some query&quot; )
 </code></pre>
 <p>which creates a Lucene query:</p>
 <pre><code>(altLabel:&quot;some query&quot; OR prefLabel:&quot;some query&quot; OR label:&quot;some query&quot;)
 </code></pre>
 <p>The <code>query string</code> syntax conforms to the underlying
 <a href="http://lucene.apache.org/core/6_4_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package_description">Lucene</a>,
 or when appropriate,
 <a href="https://www.elastic.co/guide/en/elasticsearch/reference/5.2/query-dsl.html">Elasticsearch</a>.</p>
 <p>In the case of the <em>default</em> <a href="#one-triple-equals-one-document">one-triple equals one-document</a> model, the Lucene query syntax is restricted to <code>Terms</code>, <code>Term modifiers</code>,
 <code>Boolean Operators</code> applied to <code>Terms</code>, and <code>Grouping</code> of terms.</p>
 <p>Additionally, the use of <code>Fields</code> within the <code>query string</code> is supported when using the <a href="#one-document-equals-one-entity">one-document equals one-entity</a> text integration model.</p>
 <p>When using the <a href="#one-triple-equals-one-document">default model</a>,
 use of <code>Fields</code> in the query string <strong>will generally lead to unpredictable results.</strong></p>
 <p>The optional <code>limit</code> indicates the maximum hits to be returned by Lucene.</p>
 <p>The <code>lang:xx</code> specification is an optional string, where <em>xx</em> is
 a BCP-47 language tag. This restricts searches to field values that were originally
 indexed with the tag <em>xx</em>. Searches may be restricted to field values with no
 language tag via <code>&quot;lang:none&quot;</code>.</p>
 <p>The <code>highlight:yy</code> specification is an optional string where <em>yy</em> are options that control the highlighting of search
 result literals. See <a href="#highlighting">below</a> for details.</p>
 <p>If both <code>limit</code> and one or more of <code>lang:xx</code> or <code>highlight:yy</code> are present, then <code>limit</code> must precede these arguments.</p>
 <p>If only the query string is required, the surrounding <code>( )</code> <em>may be</em> omitted.</p>
 <h4 id="output-arguments">Output arguments:</h4>
 <table>
   <thead>
       <tr>
           <th> Argument </th>
           <th>  Definition </th>
       </tr>
   </thead>
   <tbody>
       <tr>
           <td>subject URI</td>
           <td>The subject of the indexed RDF triple.</td>
       </tr>
       <tr>
           <td>score</td>
           <td>(optional) The score for the match.</td>
       </tr>
       <tr>
           <td>literal</td>
           <td>(optional) The matched object literal.</td>
       </tr>
       <tr>
           <td>graph URI</td>
           <td>(optional) The graph URI of the triple.</td>
       </tr>
       <tr>
           <td>property URI</td>
           <td>(optional) The property URI of the matched triple</td>
       </tr>
   </tbody>
 </table>
 <p>The results include the <em>subject URI</em>; the <em>score</em> assigned by the
 text search engine; and the entire matched <em>literal</em> (if the index has
 been <a href="#text-dataset-assembler">configured to store literal values</a>).
 The <em>subject URI</em> may be a variable, e.g., <code>?s</code>, or a <em>URI</em>. In the
 latter case the search is restricted to triples with the specified
 subject. The <em>score</em>, <em>literal</em>, <em>graph URI</em>, and <em>property URI</em> <strong>must</strong> be variables.
 The <em>property URI</em> is meaningful when two or more properties are used in the query.</p>
 <h3 id="query-strings">Query strings</h3>
 <p>There are several points that need to be considered when formulating
 SPARQL queries using either of the Lucene integration models.</p>
 <p>As mentioned above, in the case of the <a href="#one-triple-equals-one-document">default model</a>
 the <code>query string</code> syntax is restricted to <code>Terms</code>, <code>Term modifiers</code>, <code>Boolean Operators</code>
 applied to <code>Terms</code>, and <code>Grouping</code> of terms.</p>
 <p>Explicit use of <code>Fields</code> in the <em>query string</em> is only useful with the
 <a href="#one-document-equals-one-entity">one-document equals one-entity model</a>;
 and otherwise will generally produce unexpected results.
 See <a href="#queries-across-multiple-fields">Queries across multiple <code>Fields</code></a>.</p>
 <h4 id="simple-queries">Simple queries</h4>
 <p>The simplest use of the jena-text Lucene integration is like:</p>
 <pre><code>?s text:query &quot;some phrase&quot;
 </code></pre>
 <p>This will bind <code>?s</code> to each entity URI that is the subject of a triple
 that has the default property and an object literal that matches
 the argument string, e.g.:</p>
 <pre><code>ex:AnEntity skos:prefLabel &quot;this is some phrase to match&quot;
 </code></pre>
 <p>This query form will indicate the <em>subjects</em> that have literals that match
 for the <em>default property</em> which is determined via the configuration of
 the <code>text:predicate</code> of the <a href="#default-text-field"><code>text:defaultField</code></a>
 (in the above this has been assumed to be <code>skos:prefLabel</code>).</p>
 <p>For a <em>non-default property</em> it is necessary to specify the property as
 an input argument to the <code>text:query</code>:</p>
 <pre><code>?s text:query (rdfs:label &quot;protégé&quot;)
 </code></pre>
 <p>(see <a href="#entity-map-definition">below</a> for how RDF <em>property</em> names
 are mapped to Lucene <code>Field</code> names).</p>
 <p>If this use case is sufficient for your needs you can skip on to the
 <a href="#configuration">sections on configuration</a>.</p>
 <p><strong>Please note</strong> that the query:</p>
 <pre><code>?s text:query &quot;some phrase&quot;
 </code></pre>
 <p>when using the Lucene <code>StandardAnalyzer</code> or similar will treat the query string
 as an <code>OR</code> of terms: <code>some</code> and <code>phrase</code>. If a phrase search is required then
 it is necessary to surround the phrase by double quotes, <code>&quot;</code>:</p>
 <pre><code>?s text:query &quot;\&quot;some phrase\&quot;&quot;
 </code></pre>
 <p>This will only match strings that contain <code>&quot;some phrase&quot;</code>, while the former
 query will match strings like: <code>&quot;there is a phrase for some&quot;</code> or
 <code>&quot;this is some of the various sorts of phrase that might be matched&quot;</code>.</p>
 <h4 id="queries-with-language-tags">Queries with language tags</h4>
 <p>When working with <code>rdf:langString</code>s it is necessary that the
 <a href="#language-field"><code>text:langField</code></a> has been configured. Then it is
 as simple as writing queries such as:</p>
 <pre><code>?s text:query &quot;protégé&quot;@fr
 </code></pre>
 <p>to return results where the given term or phrase has been
 indexed under French in the <a href="#default-text-field"><code>text:defaultField</code></a>.</p>
 <p>It is also possible to use the optional <code>lang:xx</code> argument, for example:</p>
 <pre><code>?s text:query (&quot;protégé&quot; 'lang:fr') .
 </code></pre>
 <p>In general, the presence of a language tag, <code>xx</code>, on the <code>query string</code> or
 <code>lang:xx</code> in the <code>text:query</code> adds <code>AND lang:xx</code> to the query sent to Lucene,
 so the above example becomes the following Lucene query:</p>
 <pre><code>&quot;label:protégé AND lang:fr&quot;
 </code></pre>
 <p>For <em>non-default properties</em> the general form is used:</p>
 <pre><code>?s text:query (skos:altLabel &quot;protégé&quot; 'lang:fr')
 </code></pre>
 <p>Note that an explicit language tag on the <code>query string</code> takes precedence
 over the <code>lang:xx</code>, so the following</p>
 <pre><code>?s text:query (&quot;protégé&quot;@fr 'lang:none')
 </code></pre>
 <p>will find French matches rather than matches indexed without a language tag.</p>
 <h4 id="queries-that-retrieve-literals">Queries that retrieve literals</h4>
 <p>It is possible to retrieve the <em>literal</em>s that Lucene finds matches for
 assuming that</p>
 <pre><code>&lt;#TextIndex#&gt; text:storeValues true ;
 </code></pre>
 <p>has been specified in the <code>TextIndex</code> configuration. So</p>
 <pre><code>(?s ?sc ?lit) text:query (rdfs:label &quot;protégé&quot;)
 </code></pre>
 <p>will bind the matching literals to <code>?lit</code>, e.g.,</p>
 <pre><code>&quot;zorn protégé a prés&quot;@fr
 </code></pre>
 <p>Note it is necessary to include a variable to capture the Lucene <em>score</em>
 even if this value is not otherwise needed since the <em>literal</em> variable
 is determined by position.</p>
 <h4 id="queries-with-graphs">Queries with graphs</h4>
 <p>Assuming that the <a href="#graph-field"><code>text:graphField</code></a> has been configured,
 then, when a triple is indexed, the graph that the triple resides in is
 included in the document and may be used to restrict searches or to retrieve the graph that a matching triple resides in.</p>
 <p>For example:</p>
 <pre><code>select ?s ?lit
 where {
   graph ex:G2 { (?s ?sc ?lit) text:query &quot;zorn&quot; } .
 }
 </code></pre>
 <p>will restrict searches to triples with the <em>default property</em> that reside
 in graph, <code>ex:G2</code>.</p>
 <p>On the other hand:</p>
 <pre><code>select ?g ?s ?lit
 where {
   graph ?g { (?s ?sc ?lit) text:query &quot;zorn&quot; } .
 }
 </code></pre>
 <p>will iterate over the graphs in the dataset, searching each in turn for
 matches.</p>
 <p>If there is suitable structure to the graphs, e.g., a known <code>rdf:type</code> and
 depending on the selectivity of the text query and number of graphs,
 it may be more performant to express the query as follows:</p>
 <pre><code>select ?g ?s ?lit
 where {
   (?s ?sc ?lit) text:query &quot;zorn&quot; .
   graph ?g { ?s a ex:Item } .
 }
 </code></pre>
 <p>Further, if <code>tdb:unionDefaultGraph true</code> for a TDB dataset backing a Lucene index then it is possible to retrieve the graphs that contain triples resulting from a Lucene search via the fourth output argument to <code>text:query</code>:</p>
 <pre><code>select ?g ?s ?lit
 where {
   (?s ?sc ?lit ?g) text:query &quot;zorn&quot; .
 }
 </code></pre>
 <p>This will generally perform much better than either of the previous approaches when there are
 large numbers of graphs since the Lucene search will run once and the returned <em>documents</em> carry
 the containing graph URI for free as it were.</p>
 <h4 id="queries-across-multiple-fields">Queries across multiple <code>Field</code>s</h4>
 <p>As mentioned earlier, the Lucene text index uses the
 <a href="http://lucene.apache.org/core/6_4_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package_description">native Lucene query language</a>.</p>
 <h5 id="multiple-fields-in-the-default-integration-model">Multiple fields in the default integration model</h5>
 <p>For the <a href="#one-triple-equals-one-document">default integration model</a>, since each document
 has only one field containing searchable text, searching for documents containing
 multiple fields will generally not find any results.</p>
 <p>Note that the <a href="#one-triple-equals-one-document">default model</a> provides three Lucene <code>Fields</code>
 in a document that are used during searching:</p>
 <ol>
 <li>the field corresponding to the property of the indexed triple,</li>
 <li>the field for the language of the literal (if configured), and</li>
 <li>the graph that the triple is in (if configured).</li>
 </ol>
 <p>Given these, it should be clear from the above that the
 <a href="#one-triple-equals-one-document">default model</a>
 constructs a Lucene query from the <em>property</em>, <em>query string</em>, <code>lang:xx</code>, and
 SPARQL graph arguments.</p>
 <p>For example, consider the following triples:</p>
 <pre><code>ex:SomePrinter
     rdfs:label     &quot;laser printer&quot; ;
     ex:description &quot;includes a large capacity cartridge&quot; .
 </code></pre>
 <p>assuming an appropriate configuration, if we try to retrieve <code>ex:SomePrinter</code>
 with the following Lucene <code>query string</code>:</p>
 <pre><code>?s text:query &quot;label:printer AND description:\&quot;large capacity cartridge\&quot;&quot;
 </code></pre>
 <p>then this query cannot find the expected results since the <code>AND</code> is interpreted
 by Lucene to indicate that all documents that contain a matching <code>label</code> field <em>and</em>
 a matching <code>description</code> field are to be returned; yet, from the discussion above
 regarding the <a href="#document-structure">structure of Lucene documents in jena-text</a> it
 is evident that there is not one but rather in fact two separate documents, one with a
 <code>label</code> field and one with a <code>description</code> field, so an effective SPARQL query is:</p>
 <pre><code>?s text:query (rdfs:label &quot;printer&quot;) .
 ?s text:query (ex:description &quot;large capacity cartridge&quot;) .
 </code></pre>
 <p>which leads to <code>?s</code> being bound to <code>ex:SomePrinter</code>.</p>
 <p>In other words when a query is to involve two or more <em>properties</em> of a given <em>entity</em>
 then it is expressed at the SPARQL level, as it were, versus in Lucene&rsquo;s query language.</p>
 <p>It is worth noting that the equivalent of a Lucene <code>OR</code> of <code>Fields</code> can be expressed
 using SPARQL <code>union</code>, though since 3.13.0 this can be expressed in Jena text
 using a property list - see <a href="#input-arguments">Input arguments</a>:</p>
 <pre><code>{ ?s text:query (rdfs:label &quot;printer&quot;) . }
 union
 { ?s text:query (ex:description &quot;large capacity cartridge&quot;) . }
 </code></pre>
 <p>Suppose the matching literals are required for the above then it should be clear
 from the above that:</p>
 <pre><code>(?s ?sc1 ?lit1) text:query (rdfs:label &quot;printer&quot;) .
 (?s ?sc2 ?lit2) text:query (ex:description &quot;large capacity cartridge&quot;) .
 </code></pre>
 <p>will be the appropriate form to retrieve the <em>subject</em> and the associated literals, <code>?lit1</code> and <code>?lit2</code>. (Obviously, in general, the <em>score</em> variables, <code>?sc1</code> and <code>?sc2</code>
 must be distinct since it is very unlikely that the scores of the two Lucene queries
 will ever match).</p>
 <p>There is no loss of expressiveness of the Lucene query language versus the jena-text
 integration of Lucene. Any cross-field <code>AND</code>s are replaced by concurrent SPARQL calls to
 text:query as illustrated above and uses of Lucene <code>OR</code> can be converted to SPARQL
 <code>union</code>s. Uses of Lucene <code>NOT</code> are converted to appropriate SPARQL <code>filter</code>s.</p>
 <h5 id="multiple-fields-in-the-one-document-equals-one-entity-model">Multiple fields in the one-document equals one-entity model</h5>
 <p>If Lucene documents have been indexed with <a href="#one-document-equals-one-entity">multiple searchable fields</a>
 then compound queries expressed directly in the Lucene query language can significantly improve search
 performance, in particular, where the individual components of the Lucene query generate
 a lot of results which must be combined in SPARQL.</p>
 <p>It is possible to have text queries that search multiple fields within a text query.
 Doing this is more complex as it requires the use of either an externally managed
 text index or code must be provided to build the multi-field text documents to be indexed.
 See <a href="#multiple-fields-per-document">Multiple fields per document</a>.</p>
 <h4 id="queries-with-boolean-operators-and-term-modifiers">Queries with <em>Boolean Operators</em> and <em>Term Modifiers</em></h4>
 <p>On the other hand the various features of the <a href="http://lucene.apache.org/core/6_4_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package_description">Lucene query language</a>
 are all available to be used for searches within a <code>Field</code>.
 For example, <em>Boolean Operators</em> on <em>Terms</em>:</p>
 <pre><code>?s text:query (ex:description &quot;(large AND cartridge)&quot;)
 </code></pre>
 <p>and</p>
 <pre><code>(?s ?sc ?lit) text:query (ex:description &quot;(includes AND (large OR capacity))&quot;)
 </code></pre>
 <p>or <em>fuzzy</em> searches:</p>
 <pre><code>?s text:query (ex:description &quot;include~&quot;)
 </code></pre>
 <p>and so on will work as expected.</p>
 <p><strong>Always surround the query string with <code>( )</code> if more than a single term or phrase
 is involved.</strong></p>
 <h4 id="highlighting">Highlighting</h4>
 <p>The highlighting option uses the Lucene <code>Highlighter</code> and <code>SimpleHTMLFormatter</code> to insert highlighting markup into the literals returned from search results (hence the text dataset must be configured to store the literals). The highlighted results are returned via the <em>literal</em> output argument. This highlighting feature, introduced in version 3.7.0, does not require re-indexing by Lucene.</p>
 <p>The simplest way to request highlighting is via <code>'highlight:'</code>. This will apply all the defaults:</p>
 <table>
   <thead>
       <tr>
           <th> Option </th>
           <th> Key </th>
           <th> Default </th>
       </tr>
   </thead>
   <tbody>
       <tr>
           <td>maxFrags</td>
           <td>m:</td>
           <td>3</td>
       </tr>
       <tr>
           <td>fragSize</td>
           <td>z:</td>
           <td>128</td>
       </tr>
       <tr>
           <td>start</td>
           <td>s:</td>
           <td>RIGHT_ARROW</td>
       </tr>
       <tr>
           <td>end</td>
           <td>e:</td>
           <td>LEFT_ARROW</td>
       </tr>
       <tr>
           <td>fragSep</td>
           <td>f:</td>
           <td>DIVIDES</td>
       </tr>
       <tr>
           <td>joinHi</td>
           <td>jh:</td>
           <td>true</td>
       </tr>
       <tr>
           <td>joinFrags</td>
           <td>jf:</td>
           <td>true</td>
       </tr>
   </tbody>
 </table>
 <p>to the highlighting of the search results. For example if the query is:</p>
 <pre><code>(?s ?sc ?lit) text:query ( &quot;brown fox&quot; &quot;highlight:&quot; )
 </code></pre>
 <p>then a resulting literal binding might be:</p>
 <pre><code>&quot;the quick ↦brown fox↤ jumped over the lazy baboon&quot;
 </code></pre>
 <p>The <code>RIGHT_ARROW</code> is Unicode \u21a6 and the <code>LEFT_ARROW</code> is Unicode \u21a4. These are chosen to be single characters that in most situations will be very unlikely to occur in resulting literals. The <code>fragSize</code> of 128 is chosen to be large enough that in many situations the matches will result in single fragments. If the literal is larger than 128 characters and there are several matches in the literal then there may be additional fragments separated by the <code>DIVIDES</code>, Unicode \u2223.</p>
 <p>Depending on the analyzer used and the tokenizer, the highlighting will result in marking each token rather than an entire phrase. The <code>joinHi</code> option is by default <code>true</code> so that entire phrases are highlighted together rather than as individual tokens as in:</p>
 <pre><code>&quot;the quick ↦brown↤ ↦fox↤ jumped over the lazy baboon&quot;
 </code></pre>
 <p>which would result from:</p>
 <pre><code>(?s ?sc ?lit) text:query ( &quot;brown fox&quot; &quot;highlight:jh:n&quot; )
 </code></pre>
 <p>The <code>jh</code> and <code>jf</code> boolean options are set <code>false</code> via <code>n</code>. Any other value is <code>true</code>. The defaults for these options have been selected to be reasonable for most applications.</p>
 <p>The joining is performed post highlighting via Java <code>String replaceAll</code> rather than using the Lucene Unified Highlighter facility which requires that term vectors and positions be stored. The joining deletes <em>extra</em> highlighting with only intervening Unicode separators, <code>\p{Z}</code>.</p>
 <p>The more conventional output of the Lucene <code>SimpleHTMLFormatter</code> with html emphasis markup is achieved via, <code>&quot;highlight:s:&lt;em class='hiLite'&gt; | e:&lt;/em&gt;&quot;</code> (highlight options are separated by a Unicode vertical line, \u007c. The spaces are not necessary). The result with the above example will be:</p>
 <pre><code>&quot;the quick &lt;em class='hiLite'&gt;brown fox&lt;/em&gt; jumped over the lazy baboon&quot;
 </code></pre>
 <p>which would result from the query:</p>
 <pre><code>(?s ?sc ?lit) text:query ( &quot;brown fox&quot; &quot;highlight:s:&lt;em class='hiLite'&gt; | e:&lt;/em&gt;&quot; )
 </code></pre>
 <h3 id="good-practice">Good practice</h3>
 <p>From the above it should be clear that best practice, except in the simplest cases
 is to use explicit <code>text:query</code> forms such as:</p>
 <pre><code>(?s ?sc ?lit) text:query (ex:someProperty &quot;a single Field query&quot;)
 </code></pre>
 <p>possibly with <em>limit</em> and <code>lang:xx</code> arguments.</p>
 <p>Further, the query engine does not have information about the selectivity of the
 text index and so effective query plans cannot be determined
 programmatically.  It is helpful to be aware of the following two
 general query patterns.</p>
 <h4 id="query-pattern-1--find-in-the-text-index-and-refine-results">Query pattern 1 – Find in the text index and refine results</h4>
 <p>Access to the text index is first in the query and used to find a number of
 items of interest; further information is obtained about these items from
 the RDF data.</p>
 <pre><code>SELECT ?s
 { ?s text:query (rdfs:label 'word' 10) ;
      rdfs:label ?label ;
      rdf:type   ?type
 }
 </code></pre>
 <p>The <code>text:query</code> limit argument is useful when working with large indexes to limit results to the
 higher scoring results – results are returned in the order of scoring by the text search engine.</p>
 <h4 id="query-pattern-2--filter-results-via-the-text-index">Query pattern 2 – Filter results via the text index</h4>
 <p>By finding items of interest first in the RDF data, the text search can be
 used to restrict the items found still further.</p>
 <pre><code>SELECT ?s
 { ?s rdf:type     :book ;
      dc:creator  &quot;John&quot; .
   ?s text:query   (dc:title 'word') ;
 }
 </code></pre>
 <h2 id="configuration">Configuration</h2>
 <p>The usual way to describe a text index is with a
 <a href="../assembler/index.html">Jena assembler description</a>.  Configurations can
 also be built with code. The assembler describes a &lsquo;text
 dataset&rsquo; which has an underlying RDF dataset and a text index. The text
 index describes the text index technology (Lucene or Elasticsearch) and the details
 needed for each.</p>
 <p>A text index has an &ldquo;entity map&rdquo; which defines the properties to
 index, the name of the Lucene/Elasticsearch field and field used for storing the URI
 itself.</p>
 <p>For simple RDF use, there will be one field, mapping a property to a text
 index field. More complex setups, with multiple properties per entity
 (URI) are possible.</p>
 <p>The assembler file can be either the default configuration file (&hellip;/run/config.ttl)
 or a custom file in the &hellip;/run/configuration folder. Note that you can use several files
 simultaneously.</p>
 <p>You have to edit the file (see comments in the assembler code below):</p>
 <ol>
 <li>provide values for paths and a fixed URI for tdb:DatasetTDB</li>
 <li>modify the entity map : add the fields you want to index and desired options (filters, tokenizers&hellip;)</li>
 </ol>
 <p>If your assembler file is run/config.ttl, you can index the dataset with this command :</p>
 <pre><code>java -cp ./fuseki-server.jar jena.textindexer --desc=run/config.ttl
 </code></pre>
 <p>Once configured, any data added to the text dataset is automatically
 indexed as well: <a href="#building-a-text-index">Building a Text Index</a>.</p>
 <h3 id="text-dataset-assembler">Text Dataset Assembler</h3>
 <p>The following is an example of an assembler file defining a TDB dataset with a Lucene text index.</p>
 <pre><code>######## Example of a TDB dataset and text index#########################
 # The main doc sources are:
 #  - https://jena.apache.org/documentation/fuseki2/fuseki-configuration.html
 #  - https://jena.apache.org/documentation/assembler/assembler-howto.html
 #  - https://jena.apache.org/documentation/assembler/assembler.ttl
 # See https://jena.apache.org/documentation/fuseki2/fuseki-layout.html for the destination of this file.
 #########################################################################

 PREFIX :        &lt;http://localhost/jena_example/#&gt;
 PREFIX rdf:     &lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt;
 PREFIX rdfs:    &lt;http://www.w3.org/2000/01/rdf-schema#&gt;
 PREFIX tdb:     &lt;http://jena.hpl.hp.com/2008/tdb#&gt;
 PREFIX text:    &lt;http://jena.apache.org/text#&gt;
 PREFIX skos:    &lt;http://www.w3.org/2004/02/skos/core#&gt;
 PREFIX fuseki:  &lt;http://jena.apache.org/fuseki#&gt;

 [] rdf:type fuseki:Server ;
    fuseki:services (
      :myservice
    ) .

 :myservice rdf:type fuseki:Service ;
     # e.g : `s-query --service=http://localhost:3030/myds &quot;select * ...&quot;`
     fuseki:name               &quot;myds&quot; ;
     # SPARQL query service : /myds
     fuseki:endpoint [
         fuseki:operation fuseki:query ;
     ];
     # SPARQL query service : /myds/query
     fuseki:endpoint [
         fuseki:operation fuseki:query ;
         fuseki:name &quot;query&quot;
     ];
     # SPARQL update service : /myds/update
     fuseki:endpoint [
         fuseki:operation fuseki:update ;
         fuseki:name &quot;update&quot;
     ];
     # SPARQL Graph store protocol (read and write) : /myds/data
     fuseki:endpoint [
         fuseki:operation fuseki:gsp-rw ;
         fuseki:name &quot;data&quot;
     ];
     # The text-enabled dataset
     fuseki:dataset                    :text_dataset ;
     .

 ## ---------------------------------------------------------------

 # A TextDataset is a regular dataset with a text index.
 :text_dataset rdf:type     text:TextDataset ;
     text:dataset   :mydataset ; # &lt;-- replace `:mydataset` with the desired URI
     text:index     &lt;#indexLucene&gt; ;
 .

 # A TDB dataset used for RDF storage
 :mydataset rdf:type      tdb:DatasetTDB ; # &lt;-- replace `:mydataset` with the desired URI - as above
     tdb:location &quot;DB&quot; ;
     tdb:unionDefaultGraph true ; # Optional
 .

 # Text index description
 &lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &lt;file:path&gt; ;  # &lt;-- replace `&lt;file:path&gt;` with your path (e.g., `&lt;file:/.../fuseki/run/databases/MY_INDEX&gt;`)
     text:entityMap &lt;#entMap&gt; ;
     text:storeValues true ;
     text:analyzer [ a text:StandardAnalyzer ] ;
     text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
     text:queryParser text:AnalyzingQueryParser ;
     text:propLists ( [ . . . ] . . . ) ;
     text:defineAnalyzers ( [ . . . ] . . . ) ;
     text:multilingualSupport true ; # optional
 .
 # Entity map (see documentation for other options)
 &lt;#entMap&gt; a text:EntityMap ;
     text:defaultField     &quot;label&quot; ;
     text:entityField      &quot;uri&quot; ;
     text:uidField         &quot;uid&quot; ;
     text:langField        &quot;lang&quot; ;
     text:graphField       &quot;graph&quot; ;
     text:map (
         [ text:field &quot;label&quot; ;
           text:predicate skos:prefLabel ]
     ) .
 </code></pre>
 <p>See below for <a href="#entity-map-definition">more on defining an entity map</a></p>
 <p>The <code>text:TextDataset</code> has two properties:</p>
 <ul>
 <li>
 <p>a <code>text:dataset</code>, e.g., a <code>tdb:DatasetTDB</code>, to contain
 the RDF triples; and</p>
 </li>
 <li>
 <p>an index configured to use either <code>text:TextIndexLucene</code> or <code>text:TextIndexES</code>.</p>
 </li>
 </ul>
 <p>The <code>&lt;#indexLucene&gt;</code> instance of <code>text:TextIndexLucene</code>, above, has two required properties:</p>
 <ul>
 <li>
 <p>the <code>text:directory</code>
 file URI which specifies the directory that will contain the Lucene index files – if this has the
 value <code>&quot;mem&quot;</code> then the index resides in memory;</p>
 </li>
 <li>
 <p>the <code>text:entityMap</code>, <code>&lt;#entMap&gt;</code> that will define
 what properties are to be indexed and other features of the index; and</p>
 </li>
 </ul>
 <p>and several optional properties:</p>
 <ul>
 <li>
 <p><code>text:storeValues</code> controls the <a href="#storing-literal-values">storing of literal values</a>.
 It indicates whether values are stored or not – values must be stored for the
 <a href="#query-with-sparql"><code>?literal</code> return value</a> to be available in <code>text:query</code> in SPARQL.</p>
 </li>
 <li>
 <p><code>text:analyzer</code> specifies the default <a href="#configuring-an-analyzer">analyzer configuration</a> to be used
 during indexing and querying. The default analyzer defaults to Lucene&rsquo;s <code>StandardAnalyzer</code>.</p>
 </li>
 <li>
 <p><code>text:queryAnalyzer</code> specifies an optional <a href="#analyzer-for-query">analyzer for query</a> that will be
 used to analyze the query string. If not set the analyzer used to index a given field is used.</p>
 </li>
 <li>
 <p><code>text:queryParser</code> is optional and specifies an <a href="#alternative-query-parsers">alternative query parser</a></p>
 </li>
 <li>
 <p><code>text:propLists</code> is optional and allows specifying <a href="#lists-of-indexed-properties">lists of indexed properties</a> for use in <code>text:query</code></p>
 </li>
 <li>
 <p><code>text:defineAnalyzers</code> is optional and allows specification of <a href="#defined-analyzers">additional analyzers, tokenizers and filters</a></p>
 </li>
 <li>
 <p><code>text:multilingualSupport</code> enables <a href="#multilingual-support">Multilingual Support</a></p>
 </li>
 </ul>
 <p>If using Elasticsearch then an index would be configured as follows:</p>
 <pre><code>&lt;#indexES&gt; a text:TextIndexES ;
       # A comma-separated list of Host:Port values of the ElasticSearch Cluster nodes.
     text:serverList &quot;127.0.0.1:9300&quot; ;
       # Name of the ElasticSearch Cluster. If not specified defaults to 'elasticsearch'
     text:clusterName &quot;elasticsearch&quot; ;
       # The number of shards for the index. Defaults to 1
     text:shards &quot;1&quot; ;
       # The number of replicas for the index. Defaults to 1
     text:replicas &quot;1&quot; ;
       # Name of the Index. defaults to jena-text
     text:indexName &quot;jena-text&quot; ;
     text:entityMap &lt;#entMap&gt; ;
     .
 </code></pre>
 <p>and <code>text:index  &lt;#indexES&gt; ;</code> would be used in the configuration of <code>:text_dataset</code>.</p>
 <p>To use a text index assembler configuration in Java code it is necessary
 to identify the dataset URI to be assembled, such as in:</p>
 <pre><code>Dataset ds = DatasetFactory.assemble(
     &quot;text-config.ttl&quot;,
     &quot;http://localhost/jena_example/#text_dataset&quot;) ;
 </code></pre>
 <p>since the assembler contains two dataset definitions, one for the text
 dataset, one for the base data.  Therefore, the application needs to
 identify the text dataset by its URI
 <code>http://localhost/jena_example/#text_dataset</code>.</p>
 <h4 id="lists-of-indexed-properties">Lists of Indexed Properties</h4>
 <p>Since 3.13.0, an optional <code>text:TextIndexLucene</code> feature, <code>text:propLists</code>, allows defining lists of Lucene indexed
 properties that may be used in <code>text:query</code>s. For example:</p>
 <pre><code>text:propLists (
     [ text:propListProp ex:labels ;
       text:props ( skos:prefLabel
                    skos:altLabel
                    rdfs:label ) ;
     ]
     [ text:propListProp ex:workStmts ;
       text:props ( ex:workColophon
                    ex:workAuthorshipStatement
                    ex:workEditionStatement ) ;
     ]
 ) ;
 </code></pre>
 <p>The <code>text:propLists</code> is a list of <em>property list</em> definitions. Each <em>property list</em> defines a new property,
 <code>text:propListProp</code> that will be used to refer to the list in a <code>text:query</code>, for example, <code>ex:labels</code> and
 <code>ex:workStmts</code>, above. The <code>text:props</code> is a list of Lucene indexed properties that will be searched over when the
 <em>property list</em> property is referred to in a <code>text:query</code>. For example:</p>
 <pre><code>?s text:query ( ex:labels &quot;some text&quot; ) .
 </code></pre>
 <p>will request Lucene to search for documents representing triples, ?s ?p ?o, where ?p is one of: <code>rdfs:label</code> OR
 <code>skos:prefLabel</code> OR <code>skos:altLabel</code>, matching the query string.</p>
 <h3 id="entity-map-definition">Entity Map definition</h3>
 <p>A <code>text:EntityMap</code> has several properties that condition what is indexed, what information is stored, and
 what analyzers are used.</p>
 <pre><code>&lt;#entMap&gt; a text:EntityMap ;
     text:defaultField     &quot;label&quot; ;
     text:entityField      &quot;uri&quot; ;
     text:uidField         &quot;uid&quot; ;
     text:langField        &quot;lang&quot; ;
     text:graphField       &quot;graph&quot; ;
     text:map (
          [ text:field &quot;label&quot; ;
            text:predicate rdfs:label ]
          ) .
 </code></pre>
 <h4 id="default-text-field">Default text field</h4>
 <p>The <code>text:defaultField</code> specifies the default field name that Lucene will use in a query that does
 not otherwise specify a field. For example,</p>
 <pre><code>?s text:query &quot;\&quot;bread and butter\&quot;&quot;
 </code></pre>
 <p>will perform a search in the <code>label</code> field for the phrase <code>&quot;bread and butter&quot;</code></p>
 <h4 id="entity-field">Entity field</h4>
 <p>The <code>text:entityField</code> specifies the field name of the field that will contain the subject URI that
 is returned on a match. The value of the property is arbitrary so long as it is unique among the
 defined names.</p>
 <h4 id="uid-field-and-automatic-document-deletion">UID Field and automatic document deletion</h4>
 <p>When the <code>text:uidField</code> is defined in the <code>EntityMap</code> then dropping a triple will result in the
 corresponding document, if any, being deleted from the text index. The value, <code>&quot;uid&quot;</code>, is arbitrary
 and defines the name of a stored field in Lucene that holds a unique ID that represents the triple.</p>
 <p>If you configure the index via Java code, you need to set this parameter to the
 EntityDefinition instance, e.g.</p>
 <pre><code>EntityDefinition docDef = new EntityDefinition(entityField, defaultField);
 docDef.setUidField(&quot;uid&quot;);
 </code></pre>
 <p><strong>Note</strong>: If you migrate from an index without deletion support to an index with automatic deletion,
 you will need to rebuild the index to ensure that the uid information is stored.</p>
 <h4 id="language-field">Language Field</h4>
 <p>The <code>text:langField</code> is the name of the field that will store the language attribute of the literal
 in the case of an <code>rdf:langString</code>. This Entity Map property is a key element of the
 <a href="#linguistic-support-with-lucene-index">Linguistic support with Lucene index</a></p>
 <h4 id="graph-field">Graph Field</h4>
 <p>Setting the <code>text:graphField</code> allows <a href="#graph-specific-indexing">graph-specific indexing</a> of the text
 index to limit searching to a specified graph when a SPARQL query targets a single named graph. The
 field value is arbitrary and serves to store the graph ID that a triple belongs to when the index is
 updated.</p>
 <h4 id="the-analyzer-map">The Analyzer Map</h4>
 <p>The <code>text:map</code> is a list of <a href="#configuring-an-analyzer">analyzer specifications</a> as described below.</p>
 <h3 id="configuring-an-analyzer">Configuring an Analyzer</h3>
 <p>Text to be indexed is passed through a text analyzer that divides it into tokens
 and may perform other transformations such as eliminating stop words. If a Lucene
 or Elasticsearch text index is used, then by default the Lucene <code>StandardAnalyzer</code> is used.</p>
 <p>As of Jena 4.7.x / Lucene 9.x onwards, the <code>StandardAnalyzer</code> does not default to having
 English stopwords if no stop words are provided. The setting up until
 Apache Lucene 8 had the stopwords:</p>
 <pre>
       "a"  "an"  "and"  "are"  "as"  "at"  "be"  "but"  "by"  "for"  "if"  "in"  "into"  "is"
       "it"  "no"  "not"  "of"  "on"  "or"  "such"  "that"  "the"  "their"  "then"  "there"
       "these"  "they"  "this"  "to"  "was"  "will"  "with"
 </pre>
 <p>In case of a <code>TextIndexLucene</code> the default analyzer can be replaced by another analyzer with
 the <code>text:analyzer</code> property on the <code>text:TextIndexLucene</code> resource in the
 <a href="#text-dataset-assembler">text dataset assembler</a>,  for example with a <code>SimpleAnalyzer</code>:</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
         text:directory &lt;file:Lucene&gt; ;
         text:analyzer [
             a text:SimpleAnalyzer
         ]
         .
 </code></pre>
 <p>It is possible to configure an alternative analyzer for each field indexed in a
 Lucene index.  For example:</p>
 <pre><code>&lt;#entMap&gt; a text:EntityMap ;
     text:entityField      &quot;uri&quot; ;
     text:defaultField     &quot;text&quot; ;
     text:map (
          [ text:field &quot;text&quot; ;
            text:predicate rdfs:label ;
            text:analyzer [
                a text:StandardAnalyzer ;
                text:stopWords (&quot;a&quot; &quot;an&quot; &quot;and&quot; &quot;but&quot;)
            ]
          ]
          ) .
 </code></pre>
 <p>will configure the index to analyze values of the &rsquo;text&rsquo; field
 using a <code>StandardAnalyzer</code> with the given list of stop words.</p>
 <p>Other analyzer types that may be specified are <code>SimpleAnalyzer</code> and
 <code>KeywordAnalyzer</code>, neither of which has any configuration parameters. See
 the Lucene documentation for details of what these analyzers do. Jena also
 provides <code>LowerCaseKeywordAnalyzer</code>, which is a case-insensitive version of
 <code>KeywordAnalyzer</code>, and <a href="#configurableanalyzer"><code>ConfigurableAnalyzer</code></a>.</p>
 <p>Support for the new <code>LocalizedAnalyzer</code> has been introduced in Jena 3.0.0 to
 deal with Lucene language specific analyzers. See <a href="#linguistic-support-with-lucene-index">Linguistic Support with
 Lucene Index</a> for details.</p>
 <p>Support for <code>GenericAnalyzer</code>s has been introduced in Jena 3.4.0 to allow
 the use of Analyzers that do not have built-in support, e.g., <code>BrazilianAnalyzer</code>;
 require constructor parameters not otherwise supported, e.g., a stop words <code>FileReader</code> or
 a <code>stemExclusionSet</code>; and finally use of Analyzers not included in the bundled
 Lucene distribution, e.g., a <code>SanskritIASTAnalyzer</code>. See <a href="#generic-and-defined-analyzer-support">Generic and Defined
 Analyzer Support</a></p>
 <h4 id="configurableanalyzer">ConfigurableAnalyzer</h4>
 <p><code>ConfigurableAnalyzer</code> was introduced in Jena 3.0.1. It allows more detailed
 configuration of text analysis parameters by independently selecting a
 <code>Tokenizer</code> and zero or more <code>TokenFilter</code>s which are applied in order after
 tokenization. See the Lucene documentation for details on what each
 tokenizer and token filter does.</p>
 <p>The available <code>Tokenizer</code> implementations are:</p>
 <ul>
 <li><code>StandardTokenizer</code></li>
 <li><code>KeywordTokenizer</code></li>
 <li><code>WhitespaceTokenizer</code></li>
 <li><code>LetterTokenizer</code></li>
 </ul>
 <p>The available <code>TokenFilter</code> implementations are:</p>
 <ul>
 <li><code>StandardFilter</code></li>
 <li><code>LowerCaseFilter</code></li>
 <li><code>ASCIIFoldingFilter</code></li>
 <li><code>SelectiveFoldingFilter</code></li>
 </ul>
 <p>Configuration is done using Jena assembler like this:</p>
 <pre><code>text:analyzer [
   a text:ConfigurableAnalyzer ;
   text:tokenizer text:KeywordTokenizer ;
   text:filters (text:ASCIIFoldingFilter text:LowerCaseFilter)
 ]
 </code></pre>
 <p>From Jena 3.7.0, it is possible to define tokenizers and filters in addition to the <em>built-in</em>
 choices above that may be used with the <code>ConfigurableAnalyzer</code>. Tokenizers and filters are
 defined via <code>text:defineAnalyzers</code> in the <code>text:TextIndexLucene</code> assembler section
 using <a href="#generic-analyzers-tokenizers-and-filters"><code>text:GenericTokenizer</code> and <code>text:GenericFilter</code></a>.</p>
 <h4 id="analyzer-for-query">Analyzer for Query</h4>
 <p>New in Jena 2.13.0.</p>
 <p>There is an ability to specify an analyzer to be used for the query
 string itself.  It will find terms in the query text.  If not set, then
 the analyzer used for the document will be used.  The query analyzer is
 specified on the <code>TextIndexLucene</code> resource:</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &lt;file:Lucene&gt; ;
     text:entityMap &lt;#entMap&gt; ;
     text:queryAnalyzer [
         a text:KeywordAnalyzer
     ]
     .
 </code></pre>
 <h4 id="alternative-query-parsers">Alternative Query Parsers</h4>
 <p>New in Jena 3.1.0.</p>
 <p>It is possible to select a query parser other than the default QueryParser.</p>
 <p>The available <code>QueryParser</code> implementations are:</p>
 <ul>
 <li>
 <p><code>AnalyzingQueryParser</code>: Performs analysis for wildcard queries. This
 is useful in combination with accent-insensitive wildcard queries.</p>
 </li>
 <li>
 <p><code>ComplexPhraseQueryParser</code>: Permits complex phrase query syntax. Eg:
 &ldquo;(john jon jonathan~) peters*&rdquo;.  This is useful for performing wildcard
 or fuzzy queries on individual terms in a phrase.</p>
 </li>
 <li>
 <p><code>SurroundQueryParser</code>: Provides positional operators (w and n)
 that accept a numeric distance, as well as boolean
 operators (and, or, and not), wildcards (* and ?), quoting (with &quot;),
 and boosting (via ^).</p>
 </li>
 </ul>
 <p>The query parser is specified on
 the <code>TextIndexLucene</code> resource:</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &lt;file:Lucene&gt; ;
     text:entityMap &lt;#entMap&gt; ;
     text:queryParser text:AnalyzingQueryParser .
 </code></pre>
 <p>Elasticsearch currently doesn&rsquo;t support Analyzers beyond Standard Analyzer.</p>
 <h3 id="configuration-by-code">Configuration by Code</h3>
 <p>A text dataset can also be constructed in code as might be done for a
 purely in-memory setup:</p>
 <pre><code>    // Example of building a text dataset with code.
     // Example is in-memory.
     // Base dataset
     Dataset ds1 = DatasetFactory.createMem() ;

     EntityDefinition entDef = new EntityDefinition(&quot;uri&quot;, &quot;text&quot;, RDFS.label) ;

     // Lucene, in memory.
     Directory dir =  new RAMDirectory();

     // Join together into a dataset
     Dataset ds = TextDatasetFactory.createLucene(ds1, dir, entDef) ;
 </code></pre>
 <h3 id="graph-specific-indexing">Graph-specific Indexing</h3>
 <p>jena-text supports storing information about the source graph into the
 text index. This allows for more efficient text queries when the query
 targets only a single named graph. Without graph-specific indexing, text
 queries do not distinguish named graphs and will always return results
 from all graphs.</p>
 <p>Support for graph-specific indexing is enabled by defining the name of the
 index field to use for storing the graph identifier.</p>
 <p>If you use an assembler configuration, set the graph field using the
 text:graphField property on the EntityMap, e.g.</p>
 <pre><code># Mapping in the index
 # URI stored in field &quot;uri&quot;
 # Graph stored in field &quot;graph&quot;
 # rdfs:label is mapped to field &quot;text&quot;
 &lt;#entMap&gt; a text:EntityMap ;
     text:entityField      &quot;uri&quot; ;
     text:graphField       &quot;graph&quot; ;
     text:defaultField     &quot;text&quot; ;
     text:map (
          [ text:field &quot;text&quot; ; text:predicate rdfs:label ]
          ) .
 </code></pre>
 <p>If you configure the index in Java code, you need to use one of the
 EntityDefinition constructors that support the graphField parameter, e.g.</p>
 <pre><code>    EntityDefinition entDef = new EntityDefinition(&quot;uri&quot;, &quot;text&quot;, &quot;graph&quot;, RDFS.label.asNode()) ;
 </code></pre>
 <p><strong>Note:</strong> If you migrate from a global (non-graph-aware) index to a graph-aware index,
 you need to rebuild the index to ensure that the graph information is stored.</p>
 <h3 id="linguistic-support-with-lucene-index">Linguistic support with Lucene index</h3>
 <p>Language tags associated with <code>rdf:langString</code>s occurring as literals in triples may
 be used to enhance indexing and queries. Sub-sections below detail different settings with the index, and use cases with SPARQL queries.</p>
 <h4 id="explicit-language-field-in-the-index">Explicit Language Field in the Index</h4>
 <p>The language tag for object literals of triples can be stored (during triple insert/update)
 into the index to extend query capabilities.
 For that, the <code>text:langField</code> property must be set in the EntityMap assembler :</p>
 <pre><code>&lt;#entMap&gt; a text:EntityMap ;
     text:entityField      &quot;uri&quot; ;
     text:defaultField     &quot;text&quot; ;
     text:langField        &quot;lang&quot; ;
     .
 </code></pre>
 <p>If you configure the index via Java code, you need to set this parameter to the
 EntityDefinition instance, e.g.</p>
 <pre><code>EntityDefinition docDef = new EntityDefinition(entityField, defaultField);
 docDef.setLangField(&quot;lang&quot;);
 </code></pre>
 <p>Note that configuring the <code>text:langField</code> does not determine a language specific
 analyzer. It merely records the tag associated with an indexed <code>rdf:langString</code>.</p>
 <h4 id="sparql-linguistic-clause-forms">SPARQL Linguistic Clause Forms</h4>
 <p>Once the <code>langField</code> is set, you can use it directly inside SPARQL queries. For that the <code>lang:xx</code>
 argument allows you to target specific localized values. For example:</p>
 <pre><code>//target english literals
 ?s text:query (rdfs:label 'word' 'lang:en' )

 //target unlocalized literals
 ?s text:query (rdfs:label 'word' 'lang:none')

 //ignore language field
 ?s text:query (rdfs:label 'word')
 </code></pre>
 <p>Refer to the <a href="#queries-with-language-tags">section above</a> for further discussion on querying.</p>
 <h4 id="localizedanalyzer">LocalizedAnalyzer</h4>
 <p>You can specify a LocalizedAnalyzer in order to benefit from Lucene language
 specific analyzers (stemming, stop words,&hellip;). Like any other analyzers, it can
 be done for default text indexing, for each different field or for query.</p>
 <p>Using an assembler configuration, the <code>text:language</code> property needs to
 be provided, e.g :</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &lt;file:Lucene&gt; ;
     text:entityMap &lt;#entMap&gt; ;
     text:analyzer [
         a text:LocalizedAnalyzer ;
         text:language &quot;fr&quot;
     ]
     .
 </code></pre>
 <p>will configure the index to analyze values of the <em>default property</em> field using a
 FrenchAnalyzer.</p>
 <p>To configure the same example via Java code, you need to provide the analyzer to the
 index configuration object:</p>
 <pre><code>    TextIndexConfig config = new TextIndexConfig(def);
     Analyzer analyzer = Util.getLocalizedAnalyzer(&quot;fr&quot;);
     config.setAnalyzer(analyzer);
     Dataset ds = TextDatasetFactory.createLucene(ds1, dir, config) ;
 </code></pre>
 <p>Where <code>def</code>, <code>ds1</code> and <code>dir</code> are instances of <code>EntityDefinition</code>, <code>Dataset</code> and
 <code>Directory</code> classes.</p>
 <p><strong>Note</strong>: You do not have to set the <code>text:langField</code> property with a single
 localized analyzer. Also note that the above configuration will use the
 FrenchAnalyzer for all strings indexed under the <em>default property</em> regardless
 of the language tag associated with the literal (if any).</p>
 <h4 id="multilingual-support">Multilingual Support</h4>
 <p>Let us suppose that we have many triples with many localized literals in
 many different languages. It is possible to take all these languages
 into account for future mixed localized queries.  Configure the
 <code>text:multilingualSupport</code> property to enable indexing and search via localized
 analyzers based on the language tag:</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &quot;mem&quot; ;
     text:multilingualSupport true;
     .
 </code></pre>
 <p>Via Java code, set the multilingual support flag :</p>
 <pre><code>    TextIndexConfig config = new TextIndexConfig(def);
     config.setMultilingualSupport(true);
     Dataset ds = TextDatasetFactory.createLucene(ds1, dir, config) ;
 </code></pre>
 <p>This multilingual index combines dynamically all localized analyzers of existing
 languages and the storage of langField properties.</p>
 <p>The multilingual analyzer becomes the <em>default analyzer</em> and the Lucene
 <code>StandardAnalyzer</code> is the default analyzer used when there is no language tag.</p>
 <p>It is straightforward to refer to different languages in the same text search query:</p>
 <pre><code>SELECT ?s
 WHERE {
     { ?s text:query ( rdfs:label 'institut' 'lang:fr' ) }
     UNION
     { ?s text:query ( rdfs:label 'institute' 'lang:en' ) }
 }
 </code></pre>
 <p>Hence, the result set of the query will contain &ldquo;institute&rdquo; related
 subjects (institution, institutional,&hellip;) in French and in English.</p>
 <p><strong>Note</strong> When multilingual indexing is enabled for a <em>property</em>, e.g., rdfs:label,
 there will actually be two copies of each literal indexed. One under the <code>Field</code> name,
 &ldquo;label&rdquo;, and one under the name &ldquo;label_xx&rdquo;, where &ldquo;xx&rdquo; is the language tag.</p>
 <h3 id="generic-and-defined-analyzer-support">Generic and Defined Analyzer Support</h3>
 <p>There are many Analyzers that do not have built-in support, e.g.,
 <code>BrazilianAnalyzer</code>; require constructor parameters not otherwise
 supported, e.g., a stop words <code>FileReader</code> or a <code>stemExclusionSet</code>; or
 make use of Analyzers not included in the bundled Lucene distribution,
 e.g., a <code>SanskritIASTAnalyzer</code>. Two features have been added to enhance
 the utility of jena-text: 1) <code>text:GenericAnalyzer</code>; and 2)
 <code>text:DefinedAnalyzer</code>. Further, since Jena 3.7.0, features to allow definition of
 tokenizers and filters are included.</p>
 <h4 id="generic-analyzers-tokenizers-and-filters">Generic Analyzers, Tokenizers and Filters</h4>
 <p>A <code>text:GenericAnalyzer</code> includes a <code>text:class</code> which is the fully
 qualified class name of an Analyzer that is accessible on the jena
 classpath. This is trivial for Analyzer classes that are included in the
 bundled Lucene distribution and for other custom Analyzers a simple
 matter of including a jar containing the custom Analyzer and any
 associated Tokenizer and Filters on the classpath.</p>
 <p>Similarly, <code>text:GenericTokenizer</code> and <code>text:GenericFilter</code> allow access to any tokenizers
 or filters that are available on the Jena classpath. These two types are used <em>only</em> to define
 tokenizer and filter configurations that may be referred to when specifying a
 <a href="#configurableanalyzer">ConfigurableAnalyzer</a>.</p>
 <p>In addition to the <code>text:class</code> it is generally useful to include an
 ordered list of <code>text:params</code> that will be used to select an appropriate
 constructor of the Analyzer class. If there are no <code>text:params</code> in the
 analyzer specification or if the <code>text:params</code> is an empty list then the
 nullary constructor is used to instantiate the analyzer. Each element of
 the list of <code>text:params</code> includes:</p>
 <ul>
 <li>an optional <code>text:paramName</code> of type <code>Literal</code> that is useful to identify the purpose of a
 parameter in the assembler configuration</li>
 <li>a <code>text:paramType</code> which is one of:</li>
 </ul>
 <table>
   <thead>
       <tr>
           <th> Type </th>
           <th>  Description </th>
       </tr>
   </thead>
   <tbody>
       <tr>
           <td><code>text:TypeAnalyzer</code></td>
           <td>a subclass of <code>org.apache.lucene.analysis.Analyzer</code></td>
       </tr>
       <tr>
           <td><code>text:TypeBoolean</code></td>
           <td>a java <code>boolean</code></td>
       </tr>
       <tr>
           <td><code>text:TypeFile</code></td>
           <td>the <code>String</code> path to a file materialized as a <code>java.io.FileReader</code></td>
       </tr>
       <tr>
           <td><code>text:TypeInt</code></td>
           <td>a java <code>int</code></td>
       </tr>
       <tr>
           <td><code>text:TypeString</code></td>
           <td>a java <code>String</code></td>
       </tr>
       <tr>
           <td><code>text:TypeSet</code></td>
           <td>an <code>org.apache.lucene.analysis.CharArraySet</code></td>
       </tr>
   </tbody>
 </table>
 <p>and is required for the types <code>text:TypeAnalyzer</code>, <code>text:TypeFile</code> and <code>text:TypeSet</code>, but,
 since Jena 3.7.0, may be implied by the form of the literal for the types: <code>text:TypeBoolean</code>,
 <code>text:TypeInt</code> and <code>text:TypeString</code>.</p>
 <ul>
 <li>a required <code>text:paramValue</code> with an object of the type corresponding to <code>text:paramType</code></li>
 </ul>
 <p>In the case of an <code>analyzer</code> parameter the <code>text:paramValue</code> is any <code>text:analyzer</code> resource as
 described throughout this document.</p>
 <p>An example of the use of <code>text:GenericAnalyzer</code> to configure an <code>EnglishAnalyzer</code> with stop
 words and stem exclusions is:</p>
 <pre><code>text:map (
      [ text:field &quot;text&quot; ;
        text:predicate rdfs:label;
        text:analyzer [
            a text:GenericAnalyzer ;
            text:class &quot;org.apache.lucene.analysis.en.EnglishAnalyzer&quot; ;
            text:params (
                 [ text:paramName &quot;stopwords&quot; ;
                   text:paramType text:TypeSet ;
                   text:paramValue (&quot;the&quot; &quot;a&quot; &quot;an&quot;) ]
                 [ text:paramName &quot;stemExclusionSet&quot; ;
                   text:paramType text:TypeSet ;
                   text:paramValue (&quot;ing&quot; &quot;ed&quot;) ]
                 )
        ] .
 </code></pre>
 <p>Here is an example of defining an instance of <code>ShingleAnalyzerWrapper</code>:</p>
 <pre><code>text:map (
      [ text:field &quot;text&quot; ;
        text:predicate rdfs:label;
        text:analyzer [
            a text:GenericAnalyzer ;
            text:class &quot;org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper&quot; ;
            text:params (
                 [ text:paramName &quot;defaultAnalyzer&quot; ;
                   text:paramType text:TypeAnalyzer ;
                   text:paramValue [ a text:SimpleAnalyzer ] ]
                 [ text:paramName &quot;maxShingleSize&quot; ;
                   text:paramType text:TypeInt ;
                   text:paramValue 3 ]
                 )
        ] .
 </code></pre>
 <p>If there is need of using an analyzer with constructor parameter types not included here then
 one approach is to define an <code>AnalyzerWrapper</code> that uses available parameter types, such as
 <code>file</code>, to collect the information needed to instantiate the desired analyzer. An example of
 such an analyzer is the Kuromoji morphological analyzer for Japanese text that uses constructor
 parameters of types: <code>UserDictionary</code>, <code>JapaneseTokenizer.Mode</code>, <code>CharArraySet</code> and <code>Set&lt;String&gt;</code>.</p>
 <p>As mentioned above, the simple types: <code>TypeInt</code>, <code>TypeBoolean</code>, and <code>TypeString</code> may be written
 without explicitly including <code>text:paramType</code> in the parameter specification. For example:</p>
 <pre><code>                [ text:paramName &quot;maxShingleSize&quot; ;
                   text:paramValue 3 ]
 </code></pre>
 <p>is sufficient to specify the parameter.</p>
 <h4 id="defined-analyzers">Defined Analyzers</h4>
 <p>The <code>text:defineAnalyzers</code> feature allows extending the <a href="#multilingual-support">Multilingual Support</a>
 defined above. Further, this feature can also be used to name analyzers defined via <code>text:GenericAnalyzer</code>
 so that a single (perhaps complex) analyzer configuration can be used in several places.</p>
 <p>Further, since Jena 3.7.0, this feature is also used to name tokenizers and filters that
 can be referred to in the specification of a <code>ConfigurableAnalyzer</code>.</p>
 <p>The <code>text:defineAnalyzers</code> is used with <code>text:TextIndexLucene</code> to provide a list of analyzer
 definitions:</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &lt;file:Lucene&gt; ;
     text:entityMap &lt;#entMap&gt; ;
     text:defineAnalyzers (
         [ text:addLang &quot;sa-x-iast&quot; ;
           text:analyzer [ . . . ] ]
         [ text:defineAnalyzer &lt;#foo&gt; ;
           text:analyzer [ . . . ] ]
     )
     .
 </code></pre>
 <p>References to a defined analyzer may be made in the entity map like:</p>
 <pre><code>text:analyzer [
     a text:DefinedAnalyzer ;
     text:useAnalyzer &lt;#foo&gt; ]
 </code></pre>
 <p>Since Jena 3.7.0, a <code>ConfigurableAnalyzer</code> specification can refer to any defined tokenizer
 and filters, as in:</p>
 <pre><code>text:defineAnalyzers (
      [ text:defineAnalyzer :configuredAnalyzer ;
        text:analyzer [
             a text:ConfigurableAnalyzer ;
             text:tokenizer :ngram ;
             text:filters ( :asciiff text:LowerCaseFilter ) ] ]
      [ text:defineTokenizer :ngram ;
        text:tokenizer [
             a text:GenericTokenizer ;
             text:class &quot;org.apache.lucene.analysis.ngram.NGramTokenizer&quot; ;
             text:params (
                  [ text:paramName &quot;minGram&quot; ;
                    text:paramValue 3 ]
                  [ text:paramName &quot;maxGram&quot; ;
                    text:paramValue 7 ]
                  ) ] ]
      [ text:defineFilter :asciiff ;
        text:filter [
             a text:GenericFilter ;
             text:class &quot;org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter&quot; ;
             text:params (
                  [ text:paramName &quot;preserveOriginal&quot; ;
                    text:paramValue true ]
                  ) ] ]
      ) ;
 </code></pre>
 <p>And after 3.8.0 users are able to use the JenaText custom filter <code>SelectiveFoldingFilter</code>.
 This filter is not part of the Apache Lucene, but rather a custom implementation available
 for JenaText users.</p>
 <p>It is based on the Apache Lucene&rsquo;s <code>ASCIIFoldingFilter</code>, but with the addition of a
 white-list for characters that must not be replaced. This is especially useful for languages
 where some special characters and diacritical marks are useful when searching.</p>
 <p>Here&rsquo;s an example:</p>
 <pre><code>text:defineAnalyzers (
      [ text:defineAnalyzer :configuredAnalyzer ;
        text:analyzer [
             a text:ConfigurableAnalyzer ;
             text:tokenizer :tokenizer ;
             text:filters ( :selectiveFoldingFilter text:LowerCaseFilter ) ] ]
      [ text:defineTokenizer :tokenizer ;
        text:tokenizer [
             a text:GenericTokenizer ;
             text:class &quot;org.apache.lucene.analysis.core.LowerCaseTokenizer&quot; ] ]
      [ text:defineFilter :selectiveFoldingFilter ;
        text:filter [
             a text:GenericFilter ;
             text:class &quot;org.apache.jena.query.text.filter.SelectiveFoldingFilter&quot; ;
             text:params (
                  [ text:paramName &quot;whitelisted&quot; ;
                    text:paramType text:TypeSet ;
                    text:paramValue (&quot;ç&quot; &quot;ä&quot;) ]
                  ) ] ]
      ) ;
 </code></pre>
 <h4 id="extending-multilingual-support">Extending multilingual support</h4>
 <p>The <a href="#multilingual-support">Multilingual Support</a> described above allows for a limited set of
 ISO 2-letter codes to be used to select from among built-in analyzers using the nullary constructor
 associated with each analyzer. So if one is wanting to use:</p>
 <ul>
 <li>a language not included, e.g., Brazilian; or</li>
 <li>use additional constructors defining stop words, stem exclusions and so on; or</li>
 <li>refer to custom analyzers that might be associated with generalized BCP-47 language tags,
 such as, <code>sa-x-iast</code> for Sanskrit in the IAST transliteration,</li>
 </ul>
 <p>then <code>text:defineAnalyzers</code> with <code>text:addLang</code> will add the desired analyzers to the
 multilingual support so that fields with the appropriate language tags will use the appropriate
 custom analyzer.</p>
 <p>When <code>text:defineAnalyzers</code> is used with <code>text:addLang</code> then <code>text:multilingualSupport</code> is
 implicitly added if not already specified and a warning is put in the log:</p>
 <pre><code>    text:defineAnalyzers (
         [ text:addLang &quot;sa-x-iast&quot; ;
           text:analyzer [ . . . ] ]
 </code></pre>
 <p>this adds an analyzer to be used when the <code>text:langField</code> has the value <code>sa-x-iast</code> during
 indexing and search.</p>
 <h4 id="multilingual-enhancements-for-multi-encoding-searches">Multilingual enhancements for multi-encoding searches</h4>
 <p>There are two multilingual search situations that are supported as of 3.8.0:</p>
 <ul>
 <li>Search in one encoding and retrieve results that may have been entered in other encodings. For example, searching via Simplified Chinese (Hans) and retrieving results that may have been entered in Traditional Chinese (Hant) or Pinyin. This will simplify applications by permitting encoding independent retrieval without additional layers of transcoding and so on. It&rsquo;s all done under the covers in Lucene.</li>
 <li>Search with queries entered in a lossy, e.g., phonetic, encoding and retrieve results entered with accurate encoding. For example, searching via Pinyin without diacritics and retrieving all possible Hans and Hant triples.</li>
 </ul>
 <p>The first situation arises when entering triples that include languages with multiple encodings that for various reasons are not normalized to a single encoding. In this situation it is helpful to be able to retrieve appropriate result sets without regard for the encodings used at the time that the triples were inserted into the dataset.</p>
 <p>There are several such languages of interest: Chinese, Tibetan, Sanskrit, Japanese and Korean. There are various Romanizations and ideographic variants.</p>
 <p>Encodings may not be normalized when inserting triples for a variety of reasons. A principal one is that the <code>rdf:langString</code> object often must be entered in the same encoding that it occurs in some physical text that is being catalogued. Another is that metadata may be imported from sources that use different encoding conventions and it is desirable to preserve the original form.</p>
 <p>The second situation arises to provide simple support for phonetic or other forms of lossy search at the time that triples are indexed directly in the Lucene system.</p>
 <p>To handle the first situation a <code>text</code> assembler predicate, <code>text:searchFor</code>, is introduced. It specifies the list of language tags (language variants) that should be searched whenever a query string of a given encoding (language tag) is used. For example, the following <code>text:defineAnalyzers</code> fragment:</p>
 <pre><code>    [ text:addLang &quot;bo&quot; ;
       text:searchFor ( &quot;bo&quot; &quot;bo-x-ewts&quot; &quot;bo-alalc97&quot; ) ;
       text:analyzer [
         a text:GenericAnalyzer ;
         text:class &quot;io.bdrc.lucene.bo.TibetanAnalyzer&quot; ;
         text:params (
             [ text:paramName &quot;segmentInWords&quot; ;
               text:paramValue false ]
             [ text:paramName &quot;lemmatize&quot; ;
               text:paramValue true ]
             [ text:paramName &quot;filterChars&quot; ;
               text:paramValue false ]
             [ text:paramName &quot;inputMode&quot; ;
               text:paramValue &quot;unicode&quot; ]
             [ text:paramName &quot;stopFilename&quot; ;
               text:paramValue &quot;&quot; ]
             )
         ] ;
       ]
 </code></pre>
 <p>indicates that when using a search string such as &quot;རྡོ་རྗེ་སྙིང་&quot;@bo the Lucene index should also be searched for matches tagged as <code>bo-x-ewts</code> and <code>bo-alalc97</code>.</p>
 <p>This is made possible by a Tibetan <code>Analyzer</code> that tokenizes strings in all three encodings into Tibetan Unicode. This is feasible since the <code>bo-x-ewts</code> and <code>bo-alalc97</code> encodings are one-to-one with Unicode Tibetan. Since all fields with these language tags will have a common set of indexed terms, i.e., Tibetan Unicode, it suffices to arrange for the query analyzer to have access to the language tag for the query string along with the various fields that need to be considered.</p>
 <p>Supposing that the query is:</p>
 <pre><code>(?s ?sc ?lit) text:query (&quot;rje&quot;@bo-x-ewts)
 </code></pre>
 <p>Then the query formed in <code>TextIndexLucene</code> will be:</p>
 <pre><code>label_bo:rje label_bo-x-ewts:rje label_bo-alalc97:rje
 </code></pre>
 <p>which is translated using a suitable <code>Analyzer</code>, <code>QueryMultilingualAnalyzer</code>, via Lucene&rsquo;s <code>QueryParser</code> to:</p>
 <pre><code>+(label_bo:རྗེ label_bo-x-ewts:རྗེ label_bo-alalc97:རྗེ)
 </code></pre>
 <p>which reflects the underlying Tibetan Unicode term encoding. During <code>IndexSearcher.search</code> all documents with one of the three fields in the index for term, &ldquo;རྗེ&rdquo;, will be returned even though the value in the fields <code>label_bo-x-ewts</code> and <code>label_bo-alalc97</code> for the returned documents will be the original value &ldquo;rje&rdquo;.</p>
 <p>This support simplifies applications by permitting encoding independent retrieval without additional layers of transcoding and so on. It&rsquo;s all done under the covers in Lucene.</p>
 <p>Solving the second situation simplifies applications by adding appropriate fields and indexing via configuration in the <code>text:defineAnalyzers</code>. For example, the following fragment:</p>
 <pre><code>    [ text:defineAnalyzer :hanzAnalyzer ;
       text:analyzer [
         a text:GenericAnalyzer ;
         text:class &quot;io.bdrc.lucene.zh.ChineseAnalyzer&quot; ;
         text:params (
             [ text:paramName &quot;profile&quot; ;
               text:paramValue &quot;TC2SC&quot; ]
             [ text:paramName &quot;stopwords&quot; ;
               text:paramValue false ]
             [ text:paramName &quot;filterChars&quot; ;
               text:paramValue 0 ]
             )
         ] ;
       ]
     [ text:defineAnalyzer :han2pinyin ;
       text:analyzer [
         a text:GenericAnalyzer ;
         text:class &quot;io.bdrc.lucene.zh.ChineseAnalyzer&quot; ;
         text:params (
             [ text:paramName &quot;profile&quot; ;
               text:paramValue &quot;TC2PYstrict&quot; ]
             [ text:paramName &quot;stopwords&quot; ;
               text:paramValue false ]
             [ text:paramName &quot;filterChars&quot; ;
               text:paramValue 0 ]
             )
         ] ;
       ]
     [ text:defineAnalyzer :pinyin ;
       text:analyzer [
         a text:GenericAnalyzer ;
         text:class &quot;io.bdrc.lucene.zh.ChineseAnalyzer&quot; ;
         text:params (
             [ text:paramName &quot;profile&quot; ;
               text:paramValue &quot;PYstrict&quot; ]
             )
         ] ;
       ]
     [ text:addLang &quot;zh-hans&quot; ;
       text:searchFor ( &quot;zh-hans&quot; &quot;zh-hant&quot; ) ;
       text:auxIndex ( &quot;zh-aux-han2pinyin&quot; ) ;
       text:analyzer [
         a text:DefinedAnalyzer ;
         text:useAnalyzer :hanzAnalyzer ] ;
       ]
     [ text:addLang &quot;zh-hant&quot; ;
       text:searchFor ( &quot;zh-hans&quot; &quot;zh-hant&quot; ) ;
       text:auxIndex ( &quot;zh-aux-han2pinyin&quot; ) ;
       text:analyzer [
         a text:DefinedAnalyzer ;
         text:useAnalyzer :hanzAnalyzer ] ;
       ]
     [ text:addLang &quot;zh-latn-pinyin&quot; ;
       text:searchFor ( &quot;zh-latn-pinyin&quot; &quot;zh-aux-han2pinyin&quot; ) ;
       text:analyzer [
         a text:DefinedAnalyzer ;
         text:useAnalyzer :pinyin ] ;
       ]
     [ text:addLang &quot;zh-aux-han2pinyin&quot; ;
       text:searchFor ( &quot;zh-latn-pinyin&quot; &quot;zh-aux-han2pinyin&quot; ) ;
       text:analyzer [
         a text:DefinedAnalyzer ;
         text:useAnalyzer :pinyin ] ;
       text:indexAnalyzer :han2pinyin ;
       ]
 </code></pre>
 <p>defines language tags for Traditional, Simplified, Pinyin and an <em>auxiliary</em> tag <code>zh-aux-han2pinyin</code> associated with an <code>Analyzer</code>, <code>:han2pinyin</code>. The purpose of the auxiliary tag is to define an <code>Analyzer</code> that will be used during indexing and to specify a list of tags that should be searched when the auxiliary tag is used with a query string.</p>
 <p>Searching is then done via the multi-encoding support discussed above. In this example the <code>Analyzer</code>, <code>:han2pinyin</code>, tokenizes strings in <code>zh-hans</code> and <code>zh-hant</code> as the corresponding pinyin so that at search time a pinyin query will retrieve appropriate triples inserted in Traditional or Simplified Chinese. Such a query would appear as:</p>
 <pre><code>(?s ?sc ?lit ?g) text:query (&quot;jīng&quot;@zh-aux-han2pinyin)
 </code></pre>
 <p>The auxiliary field support is needed to accommodate situations such as pinyin or sound-ex which are not exact, i.e., one-to-many rather than one-to-one as in the case of Simplified and Traditional.</p>
 <p><code>TextIndexLucene</code> adds a field for each of the auxiliary tags associated with the tag of the triple object being indexed. These fields are in addition to the un-tagged field and the field tagged with the language of the triple object literal.</p>
 <h4 id="naming-analyzers-for-later-use">Naming analyzers for later use</h4>
 <p>Repeating a <code>text:GenericAnalyzer</code> specification for use with multiple fields in an entity map
 may be cumbersome. The <code>text:defineAnalyzer</code> is used in an element of a <code>text:defineAnalyzers</code>
 list to associate a resource with an analyzer so that it may be referred to later in a
 <code>text:analyzer</code> object. Assuming that an analyzer definition such as the following has appeared
 among the <code>text:defineAnalyzers</code> list:</p>
 <pre><code>[ text:defineAnalyzer &lt;#foo&gt; ;
   text:analyzer [ . . . ] ]
 </code></pre>
 <p>then in a <code>text:analyzer</code> specification in an entity map, for example, a reference to analyzer <code>&lt;#foo&gt;</code>
 is made via:</p>
 <pre><code>text:map (
      [ text:field &quot;text&quot; ;
        text:predicate rdfs:label;
        text:analyzer [
             a text:DefinedAnalyzer ;
            text:useAnalyzer &lt;#foo&gt; ]
 </code></pre>
 <p>This makes it straightforward to refer to the same (possibly complex) analyzer definition in multiple fields.</p>
 <h3 id="storing-literal-values">Storing Literal Values</h3>
 <p>New in Jena 3.0.0.</p>
 <p>It is possible to configure the text index to store enough information in the
 text index to be able to access the original indexed literal values at query time.
 This is controlled by two configuration options. First, the <code>text:storeValues</code> property
 must be set to <code>true</code> for the text index:</p>
 <pre><code>&lt;#indexLucene&gt; a text:TextIndexLucene ;
     text:directory &quot;mem&quot; ;
     text:storeValues true;
     .
 </code></pre>
 <p>Or using Java code, use the <code>setValueStored</code> method of <code>TextIndexConfig</code>:</p>
 <pre><code>    TextIndexConfig config = new TextIndexConfig(def);
     config.setValueStored(true);
 </code></pre>
 <p>Additionally, setting the <code>langField</code> configuration option is recommended. See
 <a href="#linguistic-support-with-lucene-index">Linguistic Support with Lucene Index</a>
 for details. Without the <code>langField</code> setting, the stored literals will not have
 language tag or datatype information.</p>
 <p>At query time, the stored literals can be accessed by using a 3-element list
 of variables as the subject of the <code>text:query</code> property function. The literal
 value will be bound to the third variable:</p>
 <pre><code>(?s ?score ?literal) text:query 'word'
 </code></pre>
 <h2 id="working-with-fuseki">Working with Fuseki</h2>
 <p>The Fuseki configuration simply points to the text dataset as the
 <code>fuseki:dataset</code> of the service.</p>
 <pre><code>&lt;#service_text_tdb&gt; rdf:type fuseki:Service ;
     rdfs:label                      &quot;TDB/text service&quot; ;
     fuseki:name                     &quot;ds&quot; ;
     fuseki:serviceQuery             &quot;query&quot; ;
     fuseki:serviceQuery             &quot;sparql&quot; ;
     fuseki:serviceUpdate            &quot;update&quot; ;
     fuseki:serviceReadGraphStore    &quot;get&quot; ;
     fuseki:serviceReadWriteGraphStore    &quot;data&quot; ;
     fuseki:dataset                  :text_dataset ;
     .
 </code></pre>
 <h2 id="building-a-text-index">Building a Text Index</h2>
 <p>When working at scale, or when preparing a published, read-only, SPARQL
 service, creating the index by loading the text dataset is impractical.<br>
 The index and the dataset can be built using command line tools in two
 steps: first load the RDF data, second create an index from the existing
 RDF dataset.</p>
 <h3 id="step-1---building-a-tdb-dataset">Step 1 - Building a TDB dataset</h3>
 <p><strong>Note:</strong> If you have an existing TDB dataset then you can skip this step</p>
 <p>Build the TDB dataset:</p>
 <pre><code>java -cp $FUSEKI_HOME/fuseki-server.jar tdb.tdbloader --tdb=assembler_file data_file
 </code></pre>
 <p>using the copy of TDB included with Fuseki.</p>
 <p>Alternatively, use one of the
 <a href="../tdb/commands.html">TDB utilities</a> <code>tdbloader</code> or <code>tdbloader2</code> which are better for bulk loading:</p>
 <pre><code>$JENA_HOME/bin/tdbloader --loc=directory  data_file
 </code></pre>
 <h3 id="step-2---build-the-text-index">Step 2 - Build the Text Index</h3>
 <p>You can then build the text index with the <code>jena.textindexer</code> tool:</p>
 <pre><code>java -cp $FUSEKI_HOME/fuseki-server.jar jena.textindexer --desc=assembler_file
 </code></pre>
 <p>Because a Fuseki assembler description can have several dataset descriptions,
 and several text indexes, it may be necessary to extract a single dataset and index description
 into a separate assembler file for use in loading.</p>
 <h4 id="updating-the-index">Updating the index</h4>
 <p>If you allow updates to the dataset through Fuseki, the configured index
 will automatically be updated on every modification.  This means that you
 do not have to run the above mentioned <code>jena.textindexer</code> after updates,
 only when you want to rebuild the index from scratch.</p>
 <h1 id="configuring-alternative-textdocproducers">Configuring Alternative TextDocProducers</h1>
 <h2 id="default-behavior">Default Behavior</h2>
 <p>The <a href="#one-triple-equals-one-document">default behavior</a> when performing text indexing
 is to index a single property as a single field, generating a different <code>Document</code>
 for each indexed triple. This behavior may be augmented by
 writing and configuring an alternative <code>TextDocProducer</code>.</p>
 <p><strong>Please note</strong> that <code>TextDocProducer.change(...)</code> is called once for each triple that is
 <code>ADD</code>ed or <code>DELETE</code>d, and thus can not be directly used to accumulate multiple properties
 for use in composing a single multi-fielded Lucene document. <a href="#multiple-fields-per-document">See below</a>.</p>
 <p>To configure a <code>TextDocProducer</code>, say <code>net.code.MyProducer</code> in a dataset assembly,
 use the property <code>textDocProducer</code>, eg:</p>
 <pre><code>&lt;#ds-with-lucene&gt; rdf:type text:TextDataset;
 	text:index &lt;#indexLucene&gt; ;
 	text:dataset &lt;#ds&gt; ;
 	text:textDocProducer &lt;java:net.code.MyProducer&gt; ;
 	.
 </code></pre>
 <p>where <code>net.code.MyProducer</code> stands for the full Java class name of the producer. It must have either
 a single-argument constructor of type <code>TextIndex</code>, or a two-argument
 constructor <code>(DatasetGraph, TextIndex)</code>. The <code>TextIndex</code> argument
 will be the configured text index, and the <code>DatasetGraph</code> argument
 will be the graph of the configured dataset.</p>
 <p>For example, to explicitly create the default <code>TextDocProducer</code> use:</p>
 <pre><code>...
     text:textDocProducer &lt;java:org.apache.jena.query.text.TextDocProducerTriples&gt; ;
 ...
 </code></pre>
 <p><code>TextDocProducerTriples</code> produces a new document for each subject/field
 added to the dataset, using <code>TextIndex.addEntity(Entity)</code>.</p>
 <h3 id="example">Example</h3>
 <p>The example class below is a <code>TextDocProducer</code> that only indexes
 <code>ADD</code>s of quads for which the subject already had at least one
 property-value. It uses the two-argument constructor to give it
 access to the dataset so that it can count the <code>(?G, S, P, ?O)</code> quads
 with that subject and predicate, and delegates the indexing to
 <code>TextDocProducerTriples</code> if there are at least two values for
 that property (one of those values, of course, is the one that
 gives rise to this <code>change()</code>).</p>
 <pre><code>  public class Example extends TextDocProducerTriples {

       final DatasetGraph dg;

       public Example(DatasetGraph dg, TextIndex indexer) {
           super(indexer);
           this.dg = dg;
       }

       public void change(QuadAction qaction, Node g, Node s, Node p, Node o) {
           if (qaction == QuadAction.ADD) {
               if (alreadyHasOne(s, p)) super.change(qaction, g, s, p, o);
           }
       }

       private boolean alreadyHasOne(Node s, Node p) {
           int count = 0;
           Iterator&lt;Quad&gt; quads = dg.find( null, s, p, null );
           while (quads.hasNext()) { quads.next(); count += 1; }
           return count &gt; 1;
       }
   }
 </code></pre>
 <h2 id="multiple-fields-per-document">Multiple fields per document</h2>
 <p>In principle it should be possible to extend Jena to allow for creating documents with
 multiple searchable fields by extending <code>org.apache.jena.sparql.core.DatasetChangesBatched</code>
 such as with <code>org.apache.jena.query.text.TextDocProducerEntities</code>; however, this form of
 extension is not currently (Jena 3.13.1) functional.</p>
 <h2 id="maven-dependency">Maven Dependency</h2>
 <p>The <code>jena-text</code> module is included in Fuseki.  To use it within application code,
 use the following Maven dependency:</p>
 <pre><code>&lt;dependency&gt;
   &lt;groupId&gt;org.apache.jena&lt;/groupId&gt;
   &lt;artifactId&gt;jena-text&lt;/artifactId&gt;
   &lt;version&gt;X.Y.Z&lt;/version&gt;
 &lt;/dependency&gt;
 </code></pre>
 <p>adjusting the version <code>X.Y.Z</code> as necessary.  This will automatically
 include a compatible version of Lucene.</p>
 <p>For the Elasticsearch implementation, you can include the following Maven dependency:</p>
 <pre><code>&lt;dependency&gt;
   &lt;groupId&gt;org.apache.jena&lt;/groupId&gt;
   &lt;artifactId&gt;jena-text-es&lt;/artifactId&gt;
   &lt;version&gt;X.Y.Z&lt;/version&gt;
 &lt;/dependency&gt;
 </code></pre>
 <p>adjusting the version <code>X.Y.Z</code> as necessary.</p>

   </article>

   <aside class="text-muted align-self-start mb-3 mb-xl-5 p-0 d-none d-xl-flex flex-column sticky-top">
     <h2 class="h6 sticky-top m-0 p-2 bg-body-tertiary">On this page</h2>
     <nav id="TableOfContents">
   <ul>
     <li><a href="#architecture">Architecture</a>
       <ul>
         <li><a href="#one-triple-equals-one-document">One triple equals one document</a></li>
         <li><a href="#one-document-equals-one-entity">One document equals one entity</a>
           <ul>
             <li><a href="#external-content">External content</a></li>
           </ul>
         </li>
         <li><a href="#external-applications">External applications</a></li>
         <li><a href="#document-structure">Document structure</a></li>
       </ul>
     </li>
     <li><a href="#query-with-sparql">Query with SPARQL</a>
       <ul>
         <li><a href="#syntax">Syntax</a>
           <ul>
             <li><a href="#input-arguments">Input arguments:</a></li>
             <li><a href="#output-arguments">Output arguments:</a></li>
           </ul>
         </li>
         <li><a href="#query-strings">Query strings</a>
           <ul>
             <li><a href="#simple-queries">Simple queries</a></li>
             <li><a href="#queries-with-language-tags">Queries with language tags</a></li>
             <li><a href="#queries-that-retrieve-literals">Queries that retrieve literals</a></li>
             <li><a href="#queries-with-graphs">Queries with graphs</a></li>
             <li><a href="#queries-across-multiple-fields">Queries across multiple <code>Field</code>s</a></li>
             <li><a href="#queries-with-boolean-operators-and-term-modifiers">Queries with <em>Boolean Operators</em> and <em>Term Modifiers</em></a></li>
             <li><a href="#highlighting">Highlighting</a></li>
           </ul>
         </li>
         <li><a href="#good-practice">Good practice</a>
           <ul>
             <li><a href="#query-pattern-1--find-in-the-text-index-and-refine-results">Query pattern 1 – Find in the text index and refine results</a></li>
             <li><a href="#query-pattern-2--filter-results-via-the-text-index">Query pattern 2 – Filter results via the text index</a></li>
           </ul>
         </li>
       </ul>
     </li>
     <li><a href="#configuration">Configuration</a>
       <ul>
         <li><a href="#text-dataset-assembler">Text Dataset Assembler</a>
           <ul>
             <li><a href="#lists-of-indexed-properties">Lists of Indexed Properties</a></li>
           </ul>
         </li>
         <li><a href="#entity-map-definition">Entity Map definition</a>
           <ul>
             <li><a href="#default-text-field">Default text field</a></li>
             <li><a href="#entity-field">Entity field</a></li>
             <li><a href="#uid-field-and-automatic-document-deletion">UID Field and automatic document deletion</a></li>
             <li><a href="#language-field">Language Field</a></li>
             <li><a href="#graph-field">Graph Field</a></li>
             <li><a href="#the-analyzer-map">The Analyzer Map</a></li>
           </ul>
         </li>
         <li><a href="#configuring-an-analyzer">Configuring an Analyzer</a>
           <ul>
             <li><a href="#configurableanalyzer">ConfigurableAnalyzer</a></li>
             <li><a href="#analyzer-for-query">Analyzer for Query</a></li>
             <li><a href="#alternative-query-parsers">Alternative Query Parsers</a></li>
           </ul>
         </li>
         <li><a href="#configuration-by-code">Configuration by Code</a></li>
         <li><a href="#graph-specific-indexing">Graph-specific Indexing</a></li>
         <li><a href="#linguistic-support-with-lucene-index">Linguistic support with Lucene index</a>
           <ul>
             <li><a href="#explicit-language-field-in-the-index">Explicit Language Field in the Index</a></li>
             <li><a href="#sparql-linguistic-clause-forms">SPARQL Linguistic Clause Forms</a></li>
             <li><a href="#localizedanalyzer">LocalizedAnalyzer</a></li>
             <li><a href="#multilingual-support">Multilingual Support</a></li>
           </ul>
         </li>
         <li><a href="#generic-and-defined-analyzer-support">Generic and Defined Analyzer Support</a>
           <ul>
             <li><a href="#generic-analyzers-tokenizers-and-filters">Generic Analyzers, Tokenizers and Filters</a></li>
             <li><a href="#defined-analyzers">Defined Analyzers</a></li>
             <li><a href="#extending-multilingual-support">Extending multilingual support</a></li>
             <li><a href="#multilingual-enhancements-for-multi-encoding-searches">Multilingual enhancements for multi-encoding searches</a></li>
             <li><a href="#naming-analyzers-for-later-use">Naming analyzers for later use</a></li>
           </ul>
         </li>
         <li><a href="#storing-literal-values">Storing Literal Values</a></li>
       </ul>
     </li>
     <li><a href="#working-with-fuseki">Working with Fuseki</a></li>
     <li><a href="#building-a-text-index">Building a Text Index</a>
       <ul>
         <li><a href="#step-1---building-a-tdb-dataset">Step 1 - Building a TDB dataset</a></li>
         <li><a href="#step-2---build-the-text-index">Step 2 - Build the Text Index</a>
           <ul>
             <li><a href="#updating-the-index">Updating the index</a></li>
           </ul>
         </li>
       </ul>
     </li>
   </ul>

   <ul>
     <li><a href="#default-behavior">Default Behavior</a>
       <ul>
         <li><a href="#example">Example</a></li>
       </ul>
     </li>
     <li><a href="#multiple-fields-per-document">Multiple fields per document</a></li>
     <li><a href="#maven-dependency">Maven Dependency</a></li>
   </ul>
 </nav>
   </aside>
 </main>

         </div>
     </div>
 </div>

 <footer class="bd-footer py-4 py-md-5 mt-4 mt-lg-5 bg-body-tertiary">
     <div class="container" style="font-size:80%" >
         <p>
             Copyright &copy; 2011&ndash;2025 The Apache Software Foundation, Licensed under the
             <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
         </p>
         <p>
             Apache Jena, Jena, the Apache Jena project logo, Apache and the Apache feather logos are trademarks of
             The Apache Software Foundation.
             <br/>
           <a href="https://privacy.apache.org/policies/privacy-policy-public.html"
              >Apache Software Foundation Privacy Policy</a>.
         </p>
     </div>
 </footer>

 <script src="/js/popper.min.js.js" type="text/javascript"></script>
 <script src="/js/bootstrap.min.js" type="text/javascript"></script>
 <script src="/js/improve.js" type="text/javascript"></script>

 <script type="text/javascript">
 (function() {
     'use strict'

     // Mark every anchor whose href equals the current page path as active,
     // then walk up through its enclosing UL/LI ancestors and also mark the
     // first-child anchor found inside each enclosing LI.
     //
     // querySelectorAll always returns a NodeList (possibly empty), so the
     // result can be iterated directly without an undefined/null check.
     const links = document.querySelectorAll(`a[href="${window.location.pathname}"]`)

     // Upper bound on how many ancestor levels are inspected per link.
     const levelsLimit = 4

     for (const link of links) {
         link.classList.add('active')

         let parentElement = link.parentElement
         let count = 0

         // Stop at the first ancestor that is not a list element, when the
         // level limit is exceeded, or when there is no ancestor left.
         while (parentElement !== null &&
                ['UL', 'LI'].includes(parentElement.tagName) &&
                count <= levelsLimit) {
             if (parentElement.tagName === 'LI') {
                 // querySelector returns null when the LI contains no anchor
                 // that is a first child; skip such items instead of throwing
                 // and aborting the highlighting of the remaining links.
                 const firstAnchor = parentElement.querySelector('a:first-child')
                 if (firstAnchor !== null) {
                     firstAnchor.classList.add('active')
                 }
             }
             parentElement = parentElement.parentElement
             count++
         }
     }
 })()
 </script>

 </body>
 </html>