<!-- blob: 87ebc3ee69dbc5b082d5e170f5b8b5b7e707718f [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en" data-content_root="../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Configuration &#8212; Apache Arrow DataFusion documentation</title>
<link href="../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link href="../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link rel="stylesheet"
href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../_static/styles/pydata-sphinx-theme.css?v=1140d252" />
<link rel="stylesheet" type="text/css" href="../_static/graphviz.css?v=4ae1632d" />
<link rel="stylesheet" type="text/css" href="../_static/theme_overrides.css?v=dca7052a" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf">
<script src="../_static/documentation_options.js?v=8a448e45"></script>
<script src="../_static/doctools.js?v=9bcbadda"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="SQL" href="sql.html" />
<link rel="prev" title="Custom Table Provider" href="io/table_provider.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
<!-- Google Analytics -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<div class="sidebar-start-items">
<a class="navbar-brand" href="../index.html">
<img src="../_static/images/2x_bgwhite_original.png" class="logo" alt="DataFusion documentation home">
</a>
<form class="bd-search d-flex align-items-center" action="../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
LINKS
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/datafusion-python">
GitHub and Issue Tracker
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/">
Rust's API Docs
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">
Code of conduct
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/datafusion-python/tree/main/examples">
Examples
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
USER GUIDE
</span>
</p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="introduction.html">
Introduction
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="basics.html">
Concepts
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="data-sources.html">
Data Sources
</a>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="dataframe/index.html">
DataFrames
</a>
<input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/>
<label for="toctree-checkbox-1">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="dataframe/rendering.html">
HTML Rendering in Jupyter
</a>
</li>
</ul>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="common-operations/index.html">
Common Operations
</a>
<input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/>
<label for="toctree-checkbox-2">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/views.html">
Registering Views
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/basic-info.html">
Basic Operations
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/select-and-filter.html">
Column Selections
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/expressions.html">
Expressions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/joins.html">
Joins
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/functions.html">
Functions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/aggregations.html">
Aggregation
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/windows.html">
Window Functions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="common-operations/udf-and-udfa.html">
User-Defined Functions
</a>
</li>
</ul>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="io/index.html">
IO
</a>
<input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox"/>
<label for="toctree-checkbox-3">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="io/arrow.html">
Arrow
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="io/avro.html">
Avro
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="io/csv.html">
CSV
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="io/json.html">
JSON
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="io/parquet.html">
Parquet
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="io/table_provider.html">
Custom Table Provider
</a>
</li>
</ul>
</li>
<li class="toctree-l1 current active">
<a class="current reference internal" href="#">
Configuration
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="sql.html">
SQL
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
CONTRIBUTOR GUIDE
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="../contributor-guide/introduction.html">
Introduction
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../contributor-guide/ffi.html">
Python Extensions
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
API
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children">
<a class="reference internal" href="../autoapi/index.html">
API Reference
</a>
<input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox"/>
<label for="toctree-checkbox-4">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2 has-children">
<a class="reference internal" href="../autoapi/datafusion/index.html">
datafusion
</a>
<input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox"/>
<label for="toctree-checkbox-5">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/catalog/index.html">
datafusion.catalog
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/context/index.html">
datafusion.context
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/dataframe/index.html">
datafusion.dataframe
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/dataframe_formatter/index.html">
datafusion.dataframe_formatter
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/expr/index.html">
datafusion.expr
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/functions/index.html">
datafusion.functions
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/html_formatter/index.html">
datafusion.html_formatter
</a>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="../autoapi/datafusion/input/index.html">
datafusion.input
</a>
<input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" type="checkbox"/>
<label for="toctree-checkbox-6">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="../autoapi/datafusion/input/base/index.html">
datafusion.input.base
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="../autoapi/datafusion/input/location/index.html">
datafusion.input.location
</a>
</li>
</ul>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/io/index.html">
datafusion.io
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/object_store/index.html">
datafusion.object_store
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/plan/index.html">
datafusion.plan
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/record_batch/index.html">
datafusion.record_batch
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/substrait/index.html">
datafusion.substrait
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/unparser/index.html">
datafusion.unparser
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../autoapi/datafusion/user_defined/index.html">
datafusion.user_defined
</a>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</nav>
</div>
<div class="sidebar-end-items">
</div>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<div class="toc-item">
<div class="tocsection onthispage pt-5 pb-3">
<i class="fas fa-list"></i> On this page
</div>
<nav id="bd-toc-nav">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#maximizing-cpu-usage">
Maximizing CPU Usage
</a>
<ul class="visible nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#benchmark-example">
Benchmark Example
</a>
<ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry">
<a class="reference internal nav-link" href="#important-considerations">
Important Considerations
</a>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
</div>
<div class="toc-item">
</div>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<section id="configuration">
<h1>Configuration<a class="headerlink" href="#configuration" title="Link to this heading"></a></h1>
<p>Let’s look at how we can configure DataFusion. When creating a <a class="reference internal" href="../autoapi/datafusion/context/index.html#datafusion.context.SessionContext" title="datafusion.context.SessionContext"><code class="xref py py-class docutils literal notranslate"><span class="pre">SessionContext</span></code></a>, you can pass in
a <a class="reference internal" href="../autoapi/datafusion/context/index.html#datafusion.context.SessionConfig" title="datafusion.context.SessionConfig"><code class="xref py py-class docutils literal notranslate"><span class="pre">SessionConfig</span></code></a> and <a class="reference internal" href="../autoapi/datafusion/context/index.html#datafusion.context.RuntimeEnvBuilder" title="datafusion.context.RuntimeEnvBuilder"><code class="xref py py-class docutils literal notranslate"><span class="pre">RuntimeEnvBuilder</span></code></a> object. These two cover a wide range of options.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">RuntimeEnvBuilder</span><span class="p">,</span> <span class="n">SessionConfig</span><span class="p">,</span> <span class="n">SessionContext</span>
<span class="c1"># create a session context with default settings</span>
<span class="n">ctx</span> <span class="o">=</span> <span class="n">SessionContext</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="n">ctx</span><span class="p">)</span>
<span class="c1"># create a session context with explicit runtime and config settings</span>
<span class="n">runtime</span> <span class="o">=</span> <span class="n">RuntimeEnvBuilder</span><span class="p">()</span><span class="o">.</span><span class="n">with_disk_manager_os</span><span class="p">()</span><span class="o">.</span><span class="n">with_fair_spill_pool</span><span class="p">(</span><span class="mi">10000000</span><span class="p">)</span>
<span class="n">config</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SessionConfig</span><span class="p">()</span>
<span class="o">.</span><span class="n">with_create_default_catalog_and_schema</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_default_catalog_and_schema</span><span class="p">(</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="s2">&quot;bar&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_target_partitions</span><span class="p">(</span><span class="mi">8</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_information_schema</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_repartition_joins</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_repartition_aggregations</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_repartition_windows</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_parquet_pruning</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;datafusion.execution.parquet.pushdown_filters&quot;</span><span class="p">,</span> <span class="s2">&quot;true&quot;</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">ctx</span> <span class="o">=</span> <span class="n">SessionContext</span><span class="p">(</span><span class="n">config</span><span class="p">,</span> <span class="n">runtime</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">ctx</span><span class="p">)</span>
</pre></div>
</div>
<section id="maximizing-cpu-usage">
<h2>Maximizing CPU Usage<a class="headerlink" href="#maximizing-cpu-usage" title="Link to this heading"></a></h2>
<p>DataFusion uses partitions to parallelize work. For small queries the
default configuration (number of CPU cores) is often sufficient, but to
fully utilize available hardware you can tune how many partitions are
created and when DataFusion will repartition data automatically.</p>
<p>Configure a <code class="docutils literal notranslate"><span class="pre">SessionContext</span></code> with a higher partition count:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">SessionConfig</span><span class="p">,</span> <span class="n">SessionContext</span>
<span class="c1"># allow up to 16 concurrent partitions</span>
<span class="n">config</span> <span class="o">=</span> <span class="n">SessionConfig</span><span class="p">()</span><span class="o">.</span><span class="n">with_target_partitions</span><span class="p">(</span><span class="mi">16</span><span class="p">)</span>
<span class="n">ctx</span> <span class="o">=</span> <span class="n">SessionContext</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
</pre></div>
</div>
<p>Automatic repartitioning for joins, aggregations, window functions and
other operations can be enabled to increase parallelism:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">config</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SessionConfig</span><span class="p">()</span>
<span class="o">.</span><span class="n">with_target_partitions</span><span class="p">(</span><span class="mi">16</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_repartition_joins</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_repartition_aggregations</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="o">.</span><span class="n">with_repartition_windows</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="p">)</span>
</pre></div>
</div>
<p>Manual repartitioning is available on DataFrames when you need precise
control:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">col</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="s2">&quot;data.parquet&quot;</span><span class="p">)</span>
<span class="c1"># Evenly divide into 16 partitions</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">16</span><span class="p">)</span>
<span class="c1"># Or partition by the hash of a column</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">repartition_by_hash</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">),</span> <span class="n">num</span><span class="o">=</span><span class="mi">16</span><span class="p">)</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
</pre></div>
</div>
<section id="benchmark-example">
<h3>Benchmark Example<a class="headerlink" href="#benchmark-example" title="Link to this heading"></a></h3>
<p>The repository includes a benchmark script that demonstrates how to maximize CPU usage
with DataFusion. The <code class="code docutils literal notranslate"><span class="pre">benchmarks/max_cpu_usage.py</span></code> script shows a practical example
of configuring DataFusion for optimal parallelism.</p>
<p>You can run the benchmark script to see the impact of different configuration settings:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Run with default settings (uses all CPU cores)</span>
python<span class="w"> </span>benchmarks/max_cpu_usage.py
<span class="c1"># Run with specific number of rows and partitions</span>
python<span class="w"> </span>benchmarks/max_cpu_usage.py<span class="w"> </span>--rows<span class="w"> </span><span class="m">5000000</span><span class="w"> </span>--partitions<span class="w"> </span><span class="m">16</span>
<span class="c1"># See all available options</span>
python<span class="w"> </span>benchmarks/max_cpu_usage.py<span class="w"> </span>--help
</pre></div>
</div>
<p>Here’s an example showing the performance difference between single and multiple partitions:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Single partition - slower processing</span>
$<span class="w"> </span>python<span class="w"> </span>benchmarks/max_cpu_usage.py<span class="w"> </span>--rows<span class="o">=</span><span class="m">10000000</span><span class="w"> </span>--partitions<span class="w"> </span><span class="m">1</span>
Processed<span class="w"> </span><span class="m">10000000</span><span class="w"> </span>rows<span class="w"> </span>using<span class="w"> </span><span class="m">1</span><span class="w"> </span>partitions<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="m">0</span>.107s
<span class="c1"># Multiple partitions - faster processing</span>
$<span class="w"> </span>python<span class="w"> </span>benchmarks/max_cpu_usage.py<span class="w"> </span>--rows<span class="o">=</span><span class="m">10000000</span><span class="w"> </span>--partitions<span class="w"> </span><span class="m">10</span>
Processed<span class="w"> </span><span class="m">10000000</span><span class="w"> </span>rows<span class="w"> </span>using<span class="w"> </span><span class="m">10</span><span class="w"> </span>partitions<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="m">0</span>.038s
</pre></div>
</div>
<p>This example demonstrates a nearly 3x performance improvement (0.107s vs 0.038s) when using
10 partitions instead of 1, showcasing how proper partitioning can significantly improve
CPU utilization and query performance.</p>
<p>The script demonstrates several key optimization techniques:</p>
<ol class="arabic simple">
<li><p><strong>Higher target partition count</strong>: Uses <code class="code docutils literal notranslate"><span class="pre">with_target_partitions()</span></code> to set the number of concurrent partitions</p></li>
<li><p><strong>Automatic repartitioning</strong>: Enables repartitioning for joins, aggregations, and window functions</p></li>
<li><p><strong>Manual repartitioning</strong>: Uses <code class="code docutils literal notranslate"><span class="pre">repartition()</span></code> to ensure all partitions are utilized</p></li>
<li><p><strong>CPU-intensive operations</strong>: Performs aggregations that can benefit from parallelization</p></li>
</ol>
<p>The benchmark creates synthetic data and measures the time taken to perform a sum aggregation
across the specified number of partitions. This helps you understand how partition configuration
affects performance on your specific hardware.</p>
<section id="important-considerations">
<h4>Important Considerations<a class="headerlink" href="#important-considerations" title="Link to this heading"></a></h4>
<p>The provided benchmark script demonstrates partitioning concepts using synthetic in-memory data
and simple aggregation operations. While useful for understanding basic configuration principles,
actual performance in production environments may vary significantly based on numerous factors:</p>
<p><strong>Data Sources and I/O Characteristics:</strong></p>
<ul class="simple">
<li><p><strong>Table providers</strong>: Performance differs greatly between Parquet files, CSV files, databases, and cloud storage</p></li>
<li><p><strong>Storage type</strong>: Local SSD, network-attached storage, and cloud storage have vastly different characteristics</p></li>
<li><p><strong>Network latency</strong>: Remote data sources introduce additional latency considerations</p></li>
<li><p><strong>File sizes and distribution</strong>: Large files may benefit differently from partitioning than many small files</p></li>
</ul>
<p><strong>Query and Workload Characteristics:</strong></p>
<ul class="simple">
<li><p><strong>Operation complexity</strong>: Simple aggregations versus complex joins, window functions, or nested queries</p></li>
<li><p><strong>Data distribution</strong>: Skewed data may not partition evenly, affecting parallel efficiency</p></li>
<li><p><strong>Memory usage</strong>: Large datasets may require different memory management strategies</p></li>
<li><p><strong>Concurrent workloads</strong>: Multiple queries running simultaneously affect resource allocation</p></li>
</ul>
<p><strong>Hardware and Environment Factors:</strong></p>
<ul class="simple">
<li><p><strong>CPU architecture</strong>: Different processors have varying parallel processing capabilities</p></li>
<li><p><strong>Available memory</strong>: Limited RAM may require different optimization strategies</p></li>
<li><p><strong>System load</strong>: Other applications competing for resources affect DataFusion performance</p></li>
</ul>
<p><strong>Recommendations for Production Use:</strong></p>
<p>To optimize DataFusion for your specific use case, it is strongly recommended to:</p>
<ol class="arabic simple">
<li><p><strong>Create custom benchmarks</strong> using your actual data sources, formats, and query patterns</p></li>
<li><p><strong>Test with representative data volumes</strong> that match your production workloads</p></li>
<li><p><strong>Measure end-to-end performance</strong> including data loading, processing, and result handling</p></li>
<li><p><strong>Evaluate different configuration combinations</strong> for your specific hardware and workload</p></li>
<li><p><strong>Monitor resource utilization</strong> (CPU, memory, I/O) to identify bottlenecks in your environment</p></li>
</ol>
<p>This approach will provide more accurate insights into how DataFusion configuration options
will impact your particular applications and infrastructure.</p>
<p>For more information about available <a class="reference internal" href="../autoapi/datafusion/context/index.html#datafusion.context.SessionConfig" title="datafusion.context.SessionConfig"><code class="xref py py-class docutils literal notranslate"><span class="pre">SessionConfig</span></code></a> options, see the <a class="reference external" href="https://arrow.apache.org/datafusion/user-guide/configs.html">Rust DataFusion Configuration guide</a>.
For <code class="code docutils literal notranslate"><span class="pre">RuntimeEnvBuilder</span></code> options, see the Rust <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeEnvBuilder.html">online API documentation</a>.</p>
</section>
</section>
</section>
</section>
</div>
<!-- Previous / next buttons -->
<div class='prev-next-area'>
<a class='left-prev' id="prev-link" href="io/table_provider.html" title="previous page">
<i class="fas fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Custom Table Provider</p>
</div>
</a>
<a class='right-next' id="next-link" href="sql.html" title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">SQL</p>
</div>
<i class="fas fa-angle-right"></i>
</a>
</div>
</main>
</div>
</div>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<!-- Based on pydata_sphinx_theme/footer.html -->
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright 2019-2024, Apache Software Foundation.<br>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 8.1.3.<br>
</p>
</div>
<div class="footer-item">
<p>Apache Arrow DataFusion, Arrow DataFusion, Apache, the Apache feather logo, and the Apache Arrow DataFusion project logo
are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.</p>
</div>
</div>
</footer>
</body>
</html>