blob: a169d0a3ac4655a3aad0870d5b19bbbb25494467 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<title>Apache Jena - TDB Architecture</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
<link href="/css/bootstrap-icons.css" rel="stylesheet" media="screen"><link rel="stylesheet" type="text/css" href="https://jena.apache.org/sass/jena.1b17c39a117e22b46db4c66f6395dc27c134a60377d87d2d5745b8600eb69722.css" integrity="sha256-GxfDmhF&#43;IrRttMZvY5XcJ8E0pgN32H0tV0W4YA62lyI=">
<link rel="shortcut icon" href="/images/favicon.ico" />
</head>
<body>
<nav class="navbar navbar-expand-lg bg-body-tertiary" role="navigation">
<div class="container">
<div class="navbar-header">
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<a class="navbar-brand" href="/index.html">
<img class="logo-menu" src="/images/jena-logo/jena-logo-notext-small.png" alt="jena logo">Apache Jena</a>
</div>
<div class="collapse navbar-collapse" id="navbarNav">
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li id="homepage" class="nav-item"><a class="nav-link" href="/index.html"><span class="bi-house"></span> Home</a></li>
<li id="download" class="nav-item"><a class="nav-link" href="/download/index.cgi"><span class="bi-download"></span> Download</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-journal"></span> Learn <b class="caret"></b></a>
<ul class="dropdown-menu">
<li class="dropdown-header">Tutorials</li>
<li><a class="dropdown-item" href="/tutorials/index.html">Overview</a></li>
<li><a class="dropdown-item" href="/documentation/fuseki2/index.html">Fuseki Triplestore</a></li>
<li><a class="dropdown-item" href="/documentation/notes/index.html">How-To's</a></li>
<li><a class="dropdown-item" href="/documentation/query/manipulating_sparql_using_arq.html">Manipulating SPARQL using ARQ</a></li>
<li><a class="dropdown-item" href="/tutorials/rdf_api.html">RDF core API tutorial</a></li>
<li><a class="dropdown-item" href="/tutorials/sparql.html">SPARQL tutorial</a></li>
<li><a class="dropdown-item" href="/tutorials/using_jena_with_eclipse.html">Using Jena with Eclipse</a></li>
<li class="dropdown-divider"></li>
<li class="dropdown-header">References</li>
<li><a class="dropdown-item" href="/documentation/index.html">Overview</a></li>
<li><a class="dropdown-item" href="/documentation/query/index.html">ARQ (SPARQL)</a></li>
<li><a class="dropdown-item" href="/documentation/io/">RDF I/O</a></li>
<li><a class="dropdown-item" href="/documentation/assembler/index.html">Assembler</a></li>
<li><a class="dropdown-item" href="/documentation/tools/index.html">Command-line tools</a></li>
<li><a class="dropdown-item" href="/documentation/rdfs/">Data with RDFS Inferencing</a></li>
<li><a class="dropdown-item" href="/documentation/geosparql/index.html">GeoSPARQL</a></li>
<li><a class="dropdown-item" href="/documentation/inference/index.html">Inference API</a></li>
<li><a class="dropdown-item" href="/documentation/ontology/">Ontology API</a></li>
<li><a class="dropdown-item" href="/documentation/permissions/index.html">Permissions</a></li>
<li><a class="dropdown-item" href="/documentation/extras/querybuilder/index.html">Query Builder</a></li>
<li><a class="dropdown-item" href="/documentation/rdf/index.html">RDF API</a></li>
<li><a class="dropdown-item" href="/documentation/rdfconnection/">RDF Connection - SPARQL API</a></li>
<li><a class="dropdown-item" href="/documentation/rdfstar/index.html">RDF-star</a></li>
<li><a class="dropdown-item" href="/documentation/shacl/index.html">SHACL</a></li>
<li><a class="dropdown-item" href="/documentation/shex/index.html">ShEx</a></li>
<li><a class="dropdown-item" href="/documentation/tdb/index.html">TDB</a></li>
<li><a class="dropdown-item" href="/documentation/tdb2/index.html">TDB2</a></li>
<li><a class="dropdown-item" href="/documentation/query/text-query.html">Text Search</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-journal-code"></span> Javadoc <b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/documentation/javadoc.html">All Javadoc</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/arq/">ARQ</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/fuseki2/">Fuseki</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/geosparql/">GeoSPARQL</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/jena/">Jena Core</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/permissions/">Permissions</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/extras/querybuilder/">Query Builder</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/shacl/">SHACL</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/tdb/">TDB</a></li>
<li><a class="dropdown-item" href="/documentation/javadoc/text/">Text Search</a></li>
</ul>
</li>
</ul>
<form class="d-flex" role="search" action="/search" method="GET">
<div class="input-group">
<input class="form-control border-end-0 border m-0" type="search" name="q" id="search-query" placeholder="Search...." aria-label="Search" style="width: 10rem;">
<!-- Icon-only submit button: aria-label supplies its accessible name; the icon itself is decorative and hidden from assistive technology. -->
<button class="btn btn-outline-secondary border-start-0 border" type="submit" aria-label="Search">
<i class="bi-search" aria-hidden="true"></i>
</button>
</div>
</form>
<ul class="navbar-nav">
<li id="ask" class="nav-item"><a class="nav-link" href="/help_and_support/index.html" title="Ask"><span class="bi-patch-question"></span><span class="text-body d-none d-xxl-inline"> Ask</span></a></li>
<li class="nav-item dropdown">
<a href="#" title="Get involved" class="nav-link dropdown-toggle" role="button" data-bs-toggle="dropdown" aria-expanded="false"><span class="bi-megaphone"></span><span class="text-body d-none d-xxl-inline"> Get involved </span><b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/getting_involved/index.html">Contribute</a></li>
<li><a class="dropdown-item" href="/help_and_support/bugs_and_suggestions.html">Report a bug</a></li>
<li class="dropdown-divider"></li>
<li class="dropdown-header">Project</li>
<li><a class="dropdown-item" href="/about_jena/about.html">About Jena</a></li>
<li><a class="dropdown-item" href="/about_jena/architecture.html">Architecture</a></li>
<li><a class="dropdown-item" href="/about_jena/citing.html">Citing</a></li>
<li><a class="dropdown-item" href="/about_jena/team.html">Project team</a></li>
<li><a class="dropdown-item" href="/about_jena/contributions.html">Related projects</a></li>
<li><a class="dropdown-item" href="/about_jena/roadmap.html">Roadmap</a></li>
<li><a class="dropdown-item" href="/about_jena/security-advisories.html">Security Advisories</a></li>
<li class="dropdown-divider"></li>
<li class="dropdown-header">ASF</li>
<li><a class="dropdown-item" href="https://www.apache.org/">Apache Software Foundation</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/foundation/sponsorship.html">Become a Sponsor</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/licenses/LICENSE-2.0">License</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/security/">Security</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
</ul>
</li>
<li class="nav-item" id="edit"><a class="nav-link" href="https://github.com/apache/jena-site/edit/main/source/documentation/tdb/architecture.md" title="Edit this page on GitHub"><span class="bi-pencil-square"></span><span class="text-body d-none d-xxl-inline"> Edit this page</span></a></li>
</ul>
</div>
</div>
</nav>
<div class="container">
<div class="row">
<div class="col-md-12">
<div id="breadcrumbs">
<ol class="breadcrumb mt-4 p-2 bg-body-tertiary">
<li class="breadcrumb-item"><a href='/documentation'>DOCUMENTATION</a></li>
<li class="breadcrumb-item"><a href='/documentation/tdb'>TDB</a></li>
<li class="breadcrumb-item active">ARCHITECTURE.HTML</li>
</ol>
</div>
<h1 class="title">TDB Architecture</h1>
<main class="d-flex flex-xl-row flex-column">
<aside class="text-muted align-self-start mb-3 p-0 d-xl-none d-block">
<h2 class="h6 sticky-top m-0 p-2 bg-body-tertiary">On this page</h2>
<nav id="TableOfContents">
<ul>
<li><a href="#terminology">Terminology</a></li>
<li><a href="#design">Design</a>
<ul>
<li><a href="#the-node-table">The Node Table</a></li>
<li><a href="#triple-and-quad-indexes">Triple and Quad indexes</a></li>
<li><a href="#prefixes-table">Prefixes Table</a></li>
<li><a href="#tdb-btrees">TDB B+Trees</a></li>
<li><a href="#tdb-transactions">Transactions</a></li>
</ul>
</li>
<li><a href="#inline-values">Inline values</a>
<ul>
<li><a href="#tdb2">TDB2</a></li>
<li><a href="#tdb1">TDB1</a></li>
</ul>
</li>
<li><a href="#query-processing">Query Processing</a></li>
<li><a href="#caching-on-32-and-64-bit-java-systems">Caching on 32 and 64 bit Java systems</a></li>
</ul>
</nav>
</aside>
<article class="flex-column me-lg-4">
<p>This page gives an overview of the TDB architecture.
It applies to TDB1 and TDB2 with differences noted.</p>
<h2 id="terminology">Terminology</h2>
<p>Terms like &ldquo;table&rdquo; and &ldquo;index&rdquo; are used in this description. They
don&rsquo;t directly correspond to concepts in SQL. For example, in SQL
terms, there is no triple table; that can be seen as just having
indexes for the table or, alternatively, there are 3 tables, each
of which has a primary key and TDB manages the relationship between
them.</p>
<h2 id="design">Design</h2>
<p>A dataset backed by TDB is stored in a single directory in the
filing system. A dataset consists of</p>
<ul>
<li>The node table</li>
<li>Triple and Quad indexes</li>
<li>The prefixes table</li>
</ul>
<h3 id="the-node-table">The Node Table</h3>
<p>The node table stores the representation of RDF terms (except for
inlined values - see below). It provides two mappings: from Node to
NodeId and from NodeId to Node.</p>
<p>The Node to NodeId mapping is used during data loading and when
converting constant terms in queries from their Jena Node
representation to the TDB-specific internal ids.</p>
<p>The NodeId to Node mapping is used to turn query results expressed
as TDB NodeIds into the Jena Node representation and also during
query processing when filters are applied if the whole node
representation is needed for testing (e.g. regex).</p>
<p>Node table implementations usually provide a large cache - the
NodeId to Node mapping is heavily used in query processing yet the
same NodeId can appear in many query results.</p>
<p>NodeIds are 8 byte quantities. The Node to NodeId mapping is based
on a hash of the Node (a 128 bit MD5 hash - the length was found not
to be a major performance factor).</p>
<p>The default storage of the node table is a sequential access file
for the NodeId to Node mapping and a B+Tree for the Node to NodeId
mapping.</p>
<h3 id="triple-and-quad-indexes">Triple and Quad indexes</h3>
<p>Quads are used for named graphs, triples for the default graph.
Triples are held as 3-tuples of NodeIds in triple indexes - quads
as 4-tuples. Otherwise they are handled in the same manner.</p>
<p>The triple table is 3 indexes - there is no distinguished triple
table with secondary indexes. Instead, each index has all the
information about a triple.</p>
<p>The default storage of each index is a B+Tree.</p>
<h3 id="prefixes-table">Prefixes Table</h3>
<p>The prefixes table uses a node table and an index for GPU
(Graph-&gt;Prefix-&gt;URI). It is usually small. It does not take part
in query processing. It provides support for Jena&rsquo;s PrefixMappings
used mainly for presentation and for serialisation of triples in
<a href="http://www.w3.org/TR/REC-rdf-syntax/" title="http://www.w3.org/TR/REC-rdf-syntax/">RDF/XML</a>
or
<a href="http://www.w3.org/TeamSubmission/turtle/" title="http://www.w3.org/TeamSubmission/turtle/">Turtle</a>.</p>
<h3 id="tdb-btrees">TDB B+Trees</h3>
<p>Many of the persistent data structures in TDB use a custom
implementation of
<a href="http://en.wikipedia.org/wiki/B+_tree" title="http://en.wikipedia.org/wiki/B%2B_tree">B+Trees</a>.
The TDB implementation only provides for fixed length key and fixed
length value. There is no use of the value part in triple and quads indexes.</p>
<h3 id="tdb-transactions">Transactions</h3>
<p>Both TDB1 and TDB2 provide database transactions.
The API is described on the <a href="/documentation/txn/" title="Jena Transactions">Jena Transactions page</a>.</p>
<p>When running with transactions, TDB1 and TDB2 provide support for multiple read
and write transactions without application involvement. There will be multiple
readers active, and also a single writer active (referred to as &ldquo;MR+SW&rdquo;). TDB
itself manages multiple writers, queuing them as necessary.</p>
<p>To support transactions, TDB2 uses copy-on-write MVCC data structures internally.</p>
<p>TDB1 can run non-transactionally but the application is responsible for ensuring
that there is one writer or several readers, not both. This is referred to as
&ldquo;MRSW&rdquo;. Misuse of TDB1 in non-transactional mode can corrupt the database.</p>
<h2 id="inline-values">Inline values</h2>
<p>Values of certain datatypes are held as part of the NodeId.
The top bit indicates whether the remaining 63 bits are a position in the stored
RDF terms file (high bit is 0) or an encoded value (high bit 1).</p>
<p>By storing the value, the exact lexical form is not recorded. The
integers 01 and 1 will both be treated as the value 1.</p>
<h3 id="tdb2">TDB2</h3>
<p>The TDB2 encoding is as follows:</p>
<ul>
<li>High bit (bit 63) 0 means the node is in the object table (PTR).</li>
<li>High bit (bit 63) 1, bit 62 1: double as 62 bits.</li>
<li>High bit (bit 63) 1, bit 62 0: 6 bits of type, 56 bits of value.</li>
</ul>
<p>If a value would not fit, it will be stored externally so there is no
guarantee that all integers, say, are stored inline.</p>
<ul>
<li>Integer format: signed 56 bit number, the type field has the XSD type.</li>
<li>Derived types of integer, each with their own datatype.</li>
<li>Decimal format: 8 bits scale, 48 bits of signed value.</li>
<li>Date and DateTime</li>
<li>Boolean</li>
<li>Float</li>
</ul>
<p>In the case of xsd:double, the standard Java 64 bit format is used except that the range
of the exponent is reduced by 2 bits.</p>
<ul>
<li>bit 63 : sign bit</li>
<li>bits 52-62 : exponent, 11 bits, the power of 2, bias -1023.</li>
<li>bits 0-51 : mantissa (significand) 52 bits (the leading one is not stored).</li>
</ul>
<p>Exponents are 11 bits, with values -1022 to +1023 held as 1 to 2046 (11 bits, bias -1023).
Exponents 0x000 and 0x7ff have a special meaning: 0x000 is used for zero and subnormal
numbers, and 0x7ff is used for infinity and NaN.</p>
<p>The xsd:dateTime and xsd:date ranges cover about 8000 years from
year zero with a precision down to 1 millisecond. Timezone
information is retained to an accuracy of 15 minutes with special
timezones for Z and for no explicit timezone.</p>
<h3 id="tdb1">TDB1</h3>
<p>The value spaces handled are: xsd:decimal, xsd:integer,
xsd:dateTime, xsd:date and xsd:boolean. Each has its own encoding
to fit in 56 bits. If a node falls outside of the range of values
that can be represented in the 56 bit encoding, it is stored in the
node table instead.</p>
<p>The xsd:dateTime and xsd:date ranges cover about 8000 years from
year zero with a precision down to 1 millisecond. Timezone
information is retained to an accuracy of 15 minutes with special
timezones for Z and for no explicit timezone.</p>
<p>Derived XSD datatypes are held as their base type. The exact
datatype is not retained; the value of the RDF term is.
An input of <code>xsd:int</code> will become <code>xsd:integer</code>.</p>
<h2 id="query-processing">Query Processing</h2>
<p>TDB uses quad-execution rewriting SPARQL algebra <code>(graph...)</code> to blocks of quads
where possible. It extends <code>OpExecutor</code>.
TDB provides low level optimization of basic graph patterns using a
<a href="optimizer.html">statistics based optimizer</a>.</p>
<h2 id="caching-on-32-and-64-bit-java-systems">Caching on 32 and 64 bit Java systems</h2>
<p>TDB runs on both 32-bit and 64-bit Java Virtual Machines. A 64-bit Java Virtual
Machine is the normal mode of use. The same file formats are used on both
systems and database files can be transferred between architectures (no TDB
system should be running for the database at the time of copy). What differs is
the file access mechanism used.</p>
<p>The node table caches are always in the Java heap but otherwise the OS file
system plays an important part in index caching.</p>
<p>The file access mechanism can be set explicitly, but this is not a
good idea for production usage, only for experimentation - see the
<a href="configuration.html#File_Access_Mode" title="TDB/Configuration">File Access mode option</a>.</p>
<p>On 64-bit Java, TDB uses memory mapped files, accessed in 8M segments,
and the operating system handles caching between RAM and disk. The
amount of RAM used for file caching increases and decreases as
other applications run on the machine. The fewer other programs
running on the machine, the more RAM will be available for file
caching. The mapped address space counts as part of the application
process&rsquo;s memory usage but this space is not part of the Java
heap.</p>
<p>On a 32 bit JVM, this approach does not work because Java
addressing is limited to about 1.5Gbytes (the exact figure is JVM
specific and includes any memory mapped file usage) and this would
limit the size of TDB datasets. Instead, TDB provides an in-heap
LRU cache of B+Tree blocks. Applications should set the JVM heap to
1G or above (within the JVM specific limit).</p>
<p>On 32-bit Java, TDB uses its own file caching to enable large
databases. 32-bit Java limits the address space of the JVM to about
1.5Gbytes (the exact size is JVM-dependent), and this includes
memory mapped files, even though they are not in the Java heap. The
JVM heap size may need to be increased to make space for the disk
caches used by TDB.</p>
</article>
<aside class="text-muted align-self-start mb-3 mb-xl-5 p-0 d-none d-xl-flex flex-column sticky-top">
<h2 class="h6 sticky-top m-0 p-2 bg-body-tertiary">On this page</h2>
<nav id="TableOfContents">
<ul>
<li><a href="#terminology">Terminology</a></li>
<li><a href="#design">Design</a>
<ul>
<li><a href="#the-node-table">The Node Table</a></li>
<li><a href="#triple-and-quad-indexes">Triple and Quad indexes</a></li>
<li><a href="#prefixes-table">Prefixes Table</a></li>
<li><a href="#tdb-btrees">TDB B+Trees</a></li>
<li><a href="#tdb-transactions">Transactions</a></li>
</ul>
</li>
<li><a href="#inline-values">Inline values</a>
<ul>
<li><a href="#tdb2">TDB2</a></li>
<li><a href="#tdb1">TDB1</a></li>
</ul>
</li>
<li><a href="#query-processing">Query Processing</a></li>
<li><a href="#caching-on-32-and-64-bit-java-systems">Caching on 32 and 64 bit Java systems</a></li>
</ul>
</nav>
</aside>
</main>
</div>
</div>
</div>
<footer class="bd-footer py-4 py-md-5 mt-4 mt-lg-5 bg-body-tertiary">
<div class="container" style="font-size:80%" >
<p>
Copyright &copy; 2011&ndash;2024 The Apache Software Foundation, Licensed under the
<a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p>
<p>
Apache Jena, Jena, the Apache Jena project logo, Apache and the Apache feather logos are trademarks of
The Apache Software Foundation.
<br/>
<a href="https://privacy.apache.org/policies/privacy-policy-public.html"
>Apache Software Foundation Privacy Policy</a>.
</p>
</div>
</footer>
<script src="/js/popper.min.js.js" type="text/javascript"></script>
<script src="/js/bootstrap.min.js" type="text/javascript"></script>
<script src="/js/improve.js" type="text/javascript"></script>
<script type="text/javascript">
/**
 * Highlights the navigation entries for the page being viewed.
 *
 * Every anchor whose href equals the current path receives the "active"
 * class. The script then climbs through the enclosing <ul>/<li> ancestors
 * (at most levelsLimit + 1 of them) and also marks the first matching anchor
 * found inside each <li>, so the entry above an active menu item is
 * highlighted as well.
 */
(function() {
  'use strict'
  const levelsLimit = 4
  // querySelectorAll always returns a NodeList (possibly empty), so no
  // null/undefined check is required before iterating.
  const links = document.querySelectorAll(`a[href="${window.location.pathname}"]`)
  for (const link of links) {
    link.classList.add('active')
    let ancestor = link.parentElement
    let count = 0
    // Stop at the first ancestor that is not a list element, when the
    // top of the tree is reached, or once the level limit is exceeded.
    while (ancestor !== null && ['UL', 'LI'].includes(ancestor.tagName) && count <= levelsLimit) {
      if (ancestor.tagName === 'LI') {
        // An <li> may contain no anchor that is a first child; skip it
        // instead of throwing and aborting the remaining links.
        const anchor = ancestor.querySelector('a:first-child')
        if (anchor !== null) {
          anchor.classList.add('active')
        }
      }
      ancestor = ancestor.parentElement
      count++
    }
  }
})()
</script>
</body>
</html>