blob: 7718fe9e547e8b214bc74cf18f08bbdbf724ad48 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en" dir=ZgotmplZ>
<head>
<link rel="stylesheet" href="/bootstrap/css/bootstrap.min.css">
<script src="/bootstrap/js/bootstrap.bundle.min.js"></script>
<link rel="stylesheet" type="text/css" href="/font-awesome/css/font-awesome.min.css">
<script src="/js/anchor.min.js"></script>
<script src="/js/flink.js"></script>
<link rel="canonical" href="https://flink.apache.org/2016/05/24/stream-processing-for-everyone-with-sql-and-apache-flink/">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="The capabilities of open source systems for distributed stream processing have evolved significantly over the last years. Initially, the first systems in the field (notably Apache Storm) provided low latency processing, but were limited to at-least-once guarantees, processing-time semantics, and rather low-level APIs. Since then, several new systems emerged and pushed the state of the art of open source stream processing in several dimensions. Today, users of Apache Flink or Apache Beam can use fluent Scala and Java APIs to implement stream processing jobs that operate in event-time with exactly-once semantics at high throughput and low latency.">
<meta name="theme-color" content="#FFFFFF"><meta property="og:title" content="Stream Processing for Everyone with SQL and Apache Flink" />
<meta property="og:description" content="The capabilities of open source systems for distributed stream processing have evolved significantly over the last years. Initially, the first systems in the field (notably Apache Storm) provided low latency processing, but were limited to at-least-once guarantees, processing-time semantics, and rather low-level APIs. Since then, several new systems emerged and pushed the state of the art of open source stream processing in several dimensions. Today, users of Apache Flink or Apache Beam can use fluent Scala and Java APIs to implement stream processing jobs that operate in event-time with exactly-once semantics at high throughput and low latency." />
<meta property="og:type" content="article" />
<meta property="og:url" content="https://flink.apache.org/2016/05/24/stream-processing-for-everyone-with-sql-and-apache-flink/" /><meta property="article:section" content="posts" />
<meta property="article:published_time" content="2016-05-24T10:00:00+00:00" />
<meta property="article:modified_time" content="2016-05-24T10:00:00+00:00" />
<title>Stream Processing for Everyone with SQL and Apache Flink | Apache Flink</title>
<link rel="manifest" href="/manifest.json">
<link rel="icon" href="/favicon.png" type="image/x-icon">
<link rel="stylesheet" href="/book.min.22eceb4d17baa9cdc0f57345edd6f215a40474022dfee39b63befb5fb3c596b5.css" integrity="sha256-IuzrTRe6qc3A9XNF7dbyFaQEdAIt/uObY777X7PFlrU=">
<script defer src="/en.search.min.2698f0d1b683dae4d6cb071668b310a55ebcf1c48d11410a015a51d90105b53e.js" integrity="sha256-Jpjw0baD2uTWywcWaLMQpV688cSNEUEKAVpR2QEFtT4="></script>
<!--
Made with Book Theme
https://github.com/alex-shpak/hugo-book
-->
<meta name="generator" content="Hugo 0.124.1">
<script>
var _paq = window._paq = window._paq || [];
_paq.push(['disableCookies']);
_paq.push(["setDomains", ["*.flink.apache.org","*.nightlies.apache.org/flink"]]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="//analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '1']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
</head>
<body dir=ZgotmplZ>
<header>
<nav class="navbar navbar-expand-xl">
<div class="container-fluid">
<a class="navbar-brand" href="/">
<img src="/img/logo/png/100/flink_squirrel_100_color.png" alt="Apache Flink" height="47" width="47" class="d-inline-block align-text-middle">
<span>Apache Flink</span>
</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<i class="fa fa-bars navbar-toggler-icon"></i>
</button>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav">
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" role="button" data-bs-toggle="dropdown" aria-expanded="false">About</a>
<ul class="dropdown-menu">
<li>
<a class="dropdown-item" href="/what-is-flink/flink-architecture/">Architecture</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/flink-applications/">Applications</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/flink-operations/">Operations</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/use-cases/">Use Cases</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/powered-by/">Powered By</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/roadmap/">Roadmap</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/community/">Community & Project Info</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/security/">Security</a>
</li>
<li>
<a class="dropdown-item" href="/what-is-flink/special-thanks/">Special Thanks</a>
</li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" role="button" data-bs-toggle="dropdown" aria-expanded="false">Getting Started</a>
<ul class="dropdown-menu">
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-docs-stable/docs/try-flink/local_installation/">With Flink<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-stable/docs/try-flink-kubernetes-operator/quick-start/">With Flink Kubernetes Operator<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-cdc-docs-stable/docs/get-started/introduction/">With Flink CDC<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-ml-docs-stable/docs/try-flink-ml/quick-start/">With Flink ML<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-statefun-docs-stable/getting-started/project-setup.html">With Flink Stateful Functions<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-docs-stable/docs/learn-flink/overview/">Training Course<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" role="button" data-bs-toggle="dropdown" aria-expanded="false">Documentation</a>
<ul class="dropdown-menu">
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-docs-stable/">Flink 1.19 (stable)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-docs-master/">Flink Master (snapshot)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-stable/">Kubernetes Operator 1.8 (latest)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-main">Kubernetes Operator Main (snapshot)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-cdc-docs-stable">CDC 3.0 (stable)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-cdc-docs-master">CDC Master (snapshot)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-ml-docs-stable/">ML 2.3 (stable)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-ml-docs-master">ML Master (snapshot)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-statefun-docs-stable/">Stateful Functions 3.3 (stable)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
<li>
<a class="dropdown-item" href="https://nightlies.apache.org/flink/flink-statefun-docs-master">Stateful Functions Master (snapshot)<i class="link fa fa-external-link title" aria-hidden="true"></i>
</a>
</li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" role="button" data-bs-toggle="dropdown" aria-expanded="false">How to Contribute</a>
<ul class="dropdown-menu">
<li>
<a class="dropdown-item" href="/how-to-contribute/overview/">Overview</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/contribute-code/">Contribute Code</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/reviewing-prs/">Review Pull Requests</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/code-style-and-quality-preamble/">Code Style and Quality Guide</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/contribute-documentation/">Contribute Documentation</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/documentation-style-guide/">Documentation Style Guide</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/improve-website/">Contribute to the Website</a>
</li>
<li>
<a class="dropdown-item" href="/how-to-contribute/getting-help/">Getting Help</a>
</li>
</ul>
</li>
<li class="nav-item">
<a class="nav-link" href="/posts/">Flink Blog</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/downloads/">Downloads</a>
</li>
</ul>
<div class="book-search">
<div class="book-search-spinner hidden">
<i class="fa fa-refresh fa-spin"></i>
</div>
<form class="search-bar d-flex" onsubmit="return false;"su>
<input type="text" id="book-search-input" placeholder="Search" aria-label="Search" maxlength="64" data-hotkeys="s/">
<i class="fa fa-search search"></i>
<i class="fa fa-circle-o-notch fa-spin spinner"></i>
</form>
<div class="book-search-spinner hidden"></div>
<ul id="book-search-results"></ul>
</div>
</div>
</div>
</nav>
<div class="navbar-clearfix"></div>
</header>
<main class="flex">
<section class="container book-page">
<article class="markdown">
<h1>
<a href="/2016/05/24/stream-processing-for-everyone-with-sql-and-apache-flink/">Stream Processing for Everyone with SQL and Apache Flink</a>
</h1>
May 24, 2016 -
<p><p>The capabilities of open source systems for distributed stream processing have evolved significantly over the last years. Initially, the first systems in the field (notably <a href="https://storm.apache.org">Apache Storm</a>) provided low latency processing, but were limited to at-least-once guarantees, processing-time semantics, and rather low-level APIs. Since then, several new systems emerged and pushed the state of the art of open source stream processing in several dimensions. Today, users of Apache Flink or <a href="https://beam.incubator.apache.org">Apache Beam</a> can use fluent Scala and Java APIs to implement stream processing jobs that operate in event-time with exactly-once semantics at high throughput and low latency.</p>
<p>In the meantime, stream processing has taken off in the industry. We are witnessing a rapidly growing interest in stream processing which is reflected by prevalent deployments of streaming processing infrastructure such as <a href="https://kafka.apache.org">Apache Kafka</a> and Apache Flink. The increasing number of available data streams results in a demand for people that can analyze streaming data and turn it into real-time insights. However, stream data analysis requires a special skill set including knowledge of streaming concepts such as the characteristics of unbounded streams, windows, time, and state as well as the skills to implement stream analysis jobs usually against Java or Scala APIs. People with this skill set are rare and hard to find.</p>
<p>About six months ago, the Apache Flink community started an effort to add a SQL interface for stream data analysis. SQL is <em>the</em> standard language to access and process data. Everybody who occasionally analyzes data is familiar with SQL. Consequently, a SQL interface for stream data processing will make this technology accessible to a much wider audience. Moreover, SQL support for streaming data will also enable new use cases such as interactive and ad-hoc stream analysis and significantly simplify many applications including stream ingestion and simple transformations. In this blog post, we report on the current status, architectural design, and future plans of the Apache Flink community to implement support for SQL as a language for analyzing data streams.</p>
<h2 id="where-did-we-come-from">
Where did we come from?
<a class="anchor" href="#where-did-we-come-from">#</a>
</h2>
<p>With the <a href="http://flink.apache.org/news/2015/04/13/release-0.9.0-milestone1.html">0.9.0-milestone1</a> release, Apache Flink added an API to process relational data with SQL-like expressions called the Table API. The central concept of this API is a Table, a structured data set or stream on which relational operations can be applied. The Table API is tightly integrated with the DataSet and DataStream API. A Table can be easily created from a DataSet or DataStream and can also be converted back into a DataSet or DataStream as the following example shows</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-scala" data-lang="scala"><span class="line"><span class="cl"><span class="k">val</span> <span class="n">execEnv</span> <span class="k">=</span> <span class="nc">ExecutionEnvironment</span><span class="o">.</span><span class="n">getExecutionEnvironment</span>
</span></span><span class="line"><span class="cl"><span class="k">val</span> <span class="n">tableEnv</span> <span class="k">=</span> <span class="nc">TableEnvironment</span><span class="o">.</span><span class="n">getTableEnvironment</span><span class="o">(</span><span class="n">execEnv</span><span class="o">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// obtain a DataSet from somewhere
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">tempData</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[(</span><span class="kt">String</span>, <span class="kt">Long</span>, <span class="kt">Double</span><span class="o">)]</span> <span class="k">=</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// convert the DataSet to a Table
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">tempTable</span><span class="k">:</span> <span class="kt">Table</span> <span class="o">=</span> <span class="n">tempData</span><span class="o">.</span><span class="n">toTable</span><span class="o">(</span><span class="n">tableEnv</span><span class="o">,</span> &#39;location<span class="o">,</span> &#39;time<span class="o">,</span> &#39;tempF<span class="o">)</span>
</span></span><span class="line"><span class="cl"><span class="c1">// compute your result
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">avgTempCTable</span><span class="k">:</span> <span class="kt">Table</span> <span class="o">=</span> <span class="n">tempTable</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">where</span><span class="o">(</span>&#39;location<span class="o">.</span><span class="n">like</span><span class="o">(</span><span class="s">&#34;room%&#34;</span><span class="o">))</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">select</span><span class="o">(</span>
</span></span><span class="line"><span class="cl"> <span class="o">(</span>&#39;time <span class="o">/</span> <span class="o">(</span><span class="mi">3600</span> <span class="o">*</span> <span class="mi">24</span><span class="o">))</span> <span class="n">as</span> &#39;day<span class="o">,</span>
</span></span><span class="line"><span class="cl"> &#39;Location <span class="n">as</span> &#39;room<span class="o">,</span>
</span></span><span class="line"><span class="cl"> <span class="o">((</span>&#39;tempF <span class="o">-</span> <span class="mi">32</span><span class="o">)</span> <span class="o">*</span> <span class="mf">0.556</span><span class="o">)</span> <span class="n">as</span> &#39;tempC
</span></span><span class="line"><span class="cl"> <span class="o">)</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">groupBy</span><span class="o">(</span>&#39;day<span class="o">,</span> &#39;room<span class="o">)</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">select</span><span class="o">(</span>&#39;day<span class="o">,</span> &#39;room<span class="o">,</span> &#39;tempC<span class="o">.</span><span class="n">avg</span> <span class="n">as</span> &#39;avgTempC<span class="o">)</span>
</span></span><span class="line"><span class="cl"><span class="c1">// convert result Table back into a DataSet and print it
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="n">avgTempCTable</span><span class="o">.</span><span class="n">toDataSet</span><span class="o">[</span><span class="kt">Row</span><span class="o">].</span><span class="n">print</span><span class="o">()</span>
</span></span></code></pre></div><p>Although the example shows Scala code, there is also an equivalent Java version of the Table API. The following picture depicts the original architecture of the Table API.</p>
<center>
<img src="/img/blog/stream-sql/old-table-api.png" style="width:75%;margin:15px">
</center>
<p>A Table is created from a DataSet or DataStream and transformed into a new Table by applying relational transformations such as <code>filter</code>, <code>join</code>, or <code>select</code> on them. Internally, a logical table operator tree is constructed from the applied Table transformations. When a Table is translated back into a DataSet or DataStream, the respective translator translates the logical operator tree into DataSet or DataStream operators. Expressions like <code>'location.like(&quot;room%&quot;)</code> are compiled into Flink functions via code generation.</p>
<p>However, the original Table API had a few limitations. First of all, it could not stand alone. Table API queries had to be always embedded into a DataSet or DataStream program. Queries against batch Tables did not support outer joins, sorting, and many scalar functions which are commonly used in SQL queries. Queries against streaming tables only supported filters, union, and projections and no aggregations or joins. Also, the translation process did not leverage query optimization techniques except for the physical optimization that is applied to all DataSet programs.</p>
<h2 id="table-api-joining-forces-with-sql">
Table API joining forces with SQL
<a class="anchor" href="#table-api-joining-forces-with-sql">#</a>
</h2>
<p>The discussion about adding support for SQL came up a few times in the Flink community. With Flink 0.9 and the availability of the Table API, code generation for relational expressions, and runtime operators, the foundation for such an extension seemed to be there and SQL support the next logical step. On the other hand, the community was also well aware of the multitude of dedicated &ldquo;SQL-on-Hadoop&rdquo; solutions in the open source landscape (<a href="https://hive.apache.org">Apache Hive</a>, <a href="https://drill.apache.org">Apache Drill</a>, <a href="http://impala.io">Apache Impala</a>, <a href="https://tajo.apache.org">Apache Tajo</a>, just to name a few). Given these alternatives, we figured that time would be better spent improving Flink in other ways than implementing yet another SQL-on-Hadoop solution.</p>
<p>However, with the growing popularity of stream processing and the increasing adoption of Flink in this area, the Flink community saw the need for a simpler API to enable more users to analyze streaming data. About half a year ago, we decided to take the Table API to the next level, extend the stream processing capabilities of the Table API, and add support for SQL on streaming data. What we came up with was a revised architecture for a Table API that supports SQL (and Table API) queries on streaming and static data sources. We did not want to reinvent the wheel and decided to build the new Table API on top of <a href="https://calcite.apache.org">Apache Calcite</a>, a popular SQL parser and optimizer framework. Apache Calcite is used by many projects including Apache Hive, Apache Drill, Cascading, and many <a href="https://calcite.apache.org/docs/powered_by.html">more</a>. Moreover, the Calcite community put <a href="https://calcite.apache.org/docs/stream.html">SQL on streams</a> on their roadmap which makes it a perfect fit for Flink&rsquo;s SQL interface.</p>
<p>Calcite is central in the new design as the following architecture sketch shows:</p>
<center>
<img src="/img/blog/stream-sql/new-table-api.png" style="width:75%;margin:15px">
</center>
<p>The new architecture features two integrated APIs to specify relational queries, the Table API and SQL. Queries of both APIs are validated against a catalog of registered tables and converted into Calcite&rsquo;s representation for logical plans. In this representation, stream and batch queries look exactly the same. Next, Calcite&rsquo;s cost-based optimizer applies transformation rules and optimizes the logical plans. Depending on the nature of the sources (streaming or static) we use different rule sets. Finally, the optimized plan is translated into a regular Flink DataStream or DataSet program. This step involves again code generation to compile relational expressions into Flink functions.</p>
<p>The new architecture of the Table API maintains the basic principles of the original Table API and improves it. It keeps a uniform interface for relational queries on streaming and static data. In addition, we take advantage of Calcite&rsquo;s query optimization framework and SQL parser. The design builds upon Flink&rsquo;s established APIs, i.e., the DataStream API that offers low-latency, high-throughput stream processing with exactly-once semantics and consistent results due to event-time processing, and the DataSet API with robust and efficient in-memory operators and pipelined data exchange. Any improvements to Flink&rsquo;s core APIs and engine will automatically improve the execution of Table API and SQL queries.</p>
<p>With this effort, we are adding SQL support for both streaming and static data to Flink. However, we do not want to see this as a competing solution to dedicated, high-performance SQL-on-Hadoop solutions, such as Impala, Drill, and Hive. Instead, we see the sweet spot of Flink&rsquo;s SQL integration primarily in providing access to streaming analytics to a wider audience. In addition, it will facilitate integrated applications that use Flink&rsquo;s API&rsquo;s as well as SQL while being executed on a single runtime engine.</p>
<h2 id="how-will-flinks-sql-on-streams-look-like">
How will Flink&rsquo;s SQL on streams look like?
<a class="anchor" href="#how-will-flinks-sql-on-streams-look-like">#</a>
</h2>
<p>So far we discussed the motivation for and architecture of Flink&rsquo;s stream SQL interface, but how will it actually look like? The new SQL interface is integrated into the Table API. DataStreams, DataSets, and external data sources can be registered as tables at the <code>TableEnvironment</code> in order to make them queryable with SQL. The <code>TableEnvironment.sql()</code> method states a SQL query and returns its result as a Table. The following example shows a complete program that reads a streaming table from a JSON encoded Kafka topic, processes it with a SQL query and writes the resulting stream into another Kafka topic. Please note that the KafkaJsonSource and KafkaJsonSink are under development and not available yet. In the future, TableSources and TableSinks can be persisted to and loaded from files to ease reuse of source and sink definitions and to reduce boilerplate code.</p>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-scala" data-lang="scala"><span class="line"><span class="cl"><span class="c1">// get environments
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">execEnv</span> <span class="k">=</span> <span class="nc">StreamExecutionEnvironment</span><span class="o">.</span><span class="n">getExecutionEnvironment</span>
</span></span><span class="line"><span class="cl"><span class="k">val</span> <span class="n">tableEnv</span> <span class="k">=</span> <span class="nc">TableEnvironment</span><span class="o">.</span><span class="n">getTableEnvironment</span><span class="o">(</span><span class="n">execEnv</span><span class="o">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// configure Kafka connection
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">kafkaProps</span> <span class="k">=</span> <span class="o">...</span>
</span></span><span class="line"><span class="cl"><span class="c1">// define a JSON encoded Kafka topic as external table
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">sensorSource</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">KafkaJsonSource</span><span class="o">[(</span><span class="kt">String</span>, <span class="kt">Long</span>, <span class="kt">Double</span><span class="o">)](</span>
</span></span><span class="line"><span class="cl"> <span class="s">&#34;sensorTopic&#34;</span><span class="o">,</span>
</span></span><span class="line"><span class="cl"> <span class="n">kafkaProps</span><span class="o">,</span>
</span></span><span class="line"><span class="cl"> <span class="o">(</span><span class="s">&#34;location&#34;</span><span class="o">,</span> <span class="s">&#34;time&#34;</span><span class="o">,</span> <span class="s">&#34;tempF&#34;</span><span class="o">))</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// register external table
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="n">tableEnv</span><span class="o">.</span><span class="n">registerTableSource</span><span class="o">(</span><span class="s">&#34;sensorData&#34;</span><span class="o">,</span> <span class="n">sensorSource</span><span class="o">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// define query in external table
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">roomSensors</span><span class="k">:</span> <span class="kt">Table</span> <span class="o">=</span> <span class="n">tableEnv</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span>
</span></span><span class="line"><span class="cl"> <span class="s">&#34;SELECT STREAM time, location AS room, (tempF - 32) * 0.556 AS tempC &#34;</span> <span class="o">+</span>
</span></span><span class="line"><span class="cl"> <span class="s">&#34;FROM sensorData &#34;</span> <span class="o">+</span>
</span></span><span class="line"><span class="cl"> <span class="s">&#34;WHERE location LIKE &#39;room%&#39;&#34;</span>
</span></span><span class="line"><span class="cl"> <span class="o">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// define a JSON encoded Kafka topic as external sink
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="k">val</span> <span class="n">roomSensorSink</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">KafkaJsonSink</span><span class="o">(...)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1">// define sink for room sensor data and execute query
</span></span></span><span class="line"><span class="cl"><span class="c1"></span><span class="n">roomSensors</span><span class="o">.</span><span class="n">toSink</span><span class="o">(</span><span class="n">roomSensorSink</span><span class="o">)</span>
</span></span><span class="line"><span class="cl"><span class="n">execEnv</span><span class="o">.</span><span class="n">execute</span><span class="o">()</span>
</span></span></code></pre></div><p>You might have noticed that this example left out the most interesting aspects of stream data processing: window aggregates and joins. How will these operations be expressed in SQL? Well, that is a very good question. The Apache Calcite community put out an excellent proposal that discusses the syntax and semantics of <a href="https://calcite.apache.org/docs/stream.html">SQL on streams</a>. It describes Calcite’s stream SQL as <em>&ldquo;an extension to standard SQL, not another ‘SQL-like’ language&rdquo;</em>. This has several benefits. First, people who are familiar with standard SQL will be able to analyze data streams without learning a new syntax. Queries on static tables and streams are (almost) identical and can be easily ported. Moreover it is possible to specify queries that reference static and streaming tables at the same time which goes well together with Flink’s vision to handle batch processing as a special case of stream processing, i.e., as processing finite streams. Finally, using standard SQL for stream data analysis means following a well established standard that is supported by many tools.</p>
<p>Although we haven’t completely fleshed out the details of how windows will be defined in Flink’s SQL syntax and Table API, the following examples show how a tumbling window query could look like in SQL and the Table API.</p>
<h3 id="sql-following-the-syntax-proposal-of-calcites-streaming-sql-document">
SQL (following the syntax proposal of Calcite’s streaming SQL document)
<a class="anchor" href="#sql-following-the-syntax-proposal-of-calcites-streaming-sql-document">#</a>
</h3>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-sql" data-lang="sql"><span class="line"><span class="cl"><span class="k">SELECT</span><span class="w"> </span><span class="n">STREAM</span><span class="w">
</span></span></span><span class="line"><span class="cl"><span class="w"> </span><span class="n">TUMBLE_END</span><span class="p">(</span><span class="n">time</span><span class="p">,</span><span class="w"> </span><span class="nb">INTERVAL</span><span class="w"> </span><span class="s1">&#39;1&#39;</span><span class="w"> </span><span class="k">DAY</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="k">day</span><span class="p">,</span><span class="w">
</span></span></span><span class="line"><span class="cl"><span class="w"> </span><span class="k">location</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">room</span><span class="p">,</span><span class="w">
</span></span></span><span class="line"><span class="cl"><span class="w"> </span><span class="k">AVG</span><span class="p">((</span><span class="n">tempF</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">32</span><span class="p">)</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">0</span><span class="p">.</span><span class="mi">556</span><span class="p">)</span><span class="w"> </span><span class="k">AS</span><span class="w"> </span><span class="n">avgTempC</span><span class="w">
</span></span></span><span class="line"><span class="cl"><span class="w"></span><span class="k">FROM</span><span class="w"> </span><span class="n">sensorData</span><span class="w">
</span></span></span><span class="line"><span class="cl"><span class="w"></span><span class="k">WHERE</span><span class="w"> </span><span class="k">location</span><span class="w"> </span><span class="k">LIKE</span><span class="w"> </span><span class="s1">&#39;room%&#39;</span><span class="w">
</span></span></span><span class="line"><span class="cl"><span class="w"></span><span class="k">GROUP</span><span class="w"> </span><span class="k">BY</span><span class="w"> </span><span class="n">TUMBLE</span><span class="p">(</span><span class="n">time</span><span class="p">,</span><span class="w"> </span><span class="nb">INTERVAL</span><span class="w"> </span><span class="s1">&#39;1&#39;</span><span class="w"> </span><span class="k">DAY</span><span class="p">),</span><span class="w"> </span><span class="k">location</span><span class="w">
</span></span></span></code></pre></div><h3 id="table-api">
Table API
<a class="anchor" href="#table-api">#</a>
</h3>
<div class="highlight"><pre tabindex="0" class="chroma"><code class="language-scala" data-lang="scala"><span class="line"><span class="cl"><span class="k">val</span> <span class="n">avgRoomTemp</span><span class="k">:</span> <span class="kt">Table</span> <span class="o">=</span> <span class="n">tableEnv</span><span class="o">.</span><span class="n">ingest</span><span class="o">(</span><span class="s">&#34;sensorData&#34;</span><span class="o">)</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">where</span><span class="o">(</span>&#39;location<span class="o">.</span><span class="n">like</span><span class="o">(</span><span class="s">&#34;room%&#34;</span><span class="o">))</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">partitionBy</span><span class="o">(</span>&#39;location<span class="o">)</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">window</span><span class="o">(</span><span class="nc">Tumbling</span> <span class="n">every</span> <span class="nc">Days</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span> <span class="n">on</span> &#39;time <span class="n">as</span> &#39;w<span class="o">)</span>
</span></span><span class="line"><span class="cl"> <span class="o">.</span><span class="n">select</span><span class="o">(</span>&#39;w<span class="o">.</span><span class="n">end</span><span class="o">,</span> &#39;location<span class="o">,</span> <span class="o">,</span> <span class="o">((</span>&#39;tempF <span class="o">-</span> <span class="mi">32</span><span class="o">)</span> <span class="o">*</span> <span class="mf">0.556</span><span class="o">).</span><span class="n">avg</span> <span class="n">as</span> &#39;avgTempCs<span class="o">)</span>
</span></span></code></pre></div><h2 id="whats-up-next">
What&rsquo;s up next?
<a class="anchor" href="#whats-up-next">#</a>
</h2>
<p>The Flink community is actively working on SQL support for the next minor version Flink 1.1.0. In the first version, SQL (and Table API) queries on streams will be limited to selection, filter, and union operators. Compared to Flink 1.0.0, the revised Table API will support many more scalar functions and be able to read tables from external sources and write them back to external sinks. A lot of work went into reworking the architecture of the Table API and integrating Apache Calcite.</p>
<p>In Flink 1.2.0, the feature set of SQL on streams will be significantly extended. Among other things, we plan to support different types of window aggregates and maybe also streaming joins. For this effort, we want to closely collaborate with the Apache Calcite community and help extending Calcite&rsquo;s support for relational operations on streaming data when necessary.</p>
<p>If this post made you curious and you want to try out Flink’s SQL interface and the new Table API, we encourage you to do so! Simply clone the SNAPSHOT <a href="https://github.com/apache/flink/tree/master">master branch</a> and check out the <a href="//nightlies.apache.org/flink/flink-docs-master/apis/table.html">Table API documentation for the SNAPSHOT version</a>. Please note that the branch is under heavy development, and hence some code examples in this blog post might not work. We are looking forward to your feedback and welcome contributions.</p>
</p>
</article>
<div class="edit-this-page">
<p>
<a href="https://cwiki.apache.org/confluence/display/FLINK/Flink+Translation+Specifications">Want to contribute translation?</a>
</p>
<p>
<a href="//github.com/apache/flink-web/edit/asf-site/docs/content/posts/2016-05-24-stream-sql.md">
Edit This Page<i class="fa fa-edit fa-fw"></i>
</a>
</p>
</div>
</section>
<aside class="book-toc">
<nav id="TableOfContents"><h3>On This Page <a href="javascript:void(0)" class="toc" onclick="collapseToc()"><i class="fa fa-times" aria-hidden="true"></i></a></h3>
<ul>
<li>
<ul>
<li><a href="#where-did-we-come-from">Where did we come from?</a></li>
<li><a href="#table-api-joining-forces-with-sql">Table API joining forces with SQL</a></li>
<li><a href="#how-will-flinks-sql-on-streams-look-like">How will Flink&rsquo;s SQL on streams look like?</a>
<ul>
<li><a href="#sql-following-the-syntax-proposal-of-calcites-streaming-sql-document">SQL (following the syntax proposal of Calcite’s streaming SQL document)</a></li>
<li><a href="#table-api">Table API</a></li>
</ul>
</li>
<li><a href="#whats-up-next">What&rsquo;s up next?</a></li>
</ul>
</li>
</ul>
</nav>
</aside>
<aside class="expand-toc hidden">
<a class="toc" onclick="expandToc()" href="javascript:void(0)">
<i class="fa fa-bars" aria-hidden="true"></i>
</a>
</aside>
</main>
<footer>
<div class="separator"></div>
<div class="panels">
<div class="wrapper">
<div class="panel">
<ul>
<li>
<a href="https://flink-packages.org/">flink-packages.org</a>
</li>
<li>
<a href="https://www.apache.org/">Apache Software Foundation</a>
</li>
<li>
<a href="https://www.apache.org/licenses/">License</a>
</li>
<li>
<a href="/zh/">
<i class="fa fa-globe" aria-hidden="true"></i>&nbsp;中文版
</a>
</li>
</ul>
</div>
<div class="panel">
<ul>
<li>
<a href="/what-is-flink/security">Security</a-->
</li>
<li>
<a href="https://www.apache.org/foundation/sponsorship.html">Donate</a>
</li>
<li>
<a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
</li>
</ul>
</div>
<div class="panel icons">
<div>
<a href="/posts">
<div class="icon flink-blog-icon"></div>
<span>Flink blog</span>
</a>
</div>
<div>
<a href="https://github.com/apache/flink">
<div class="icon flink-github-icon"></div>
<span>Github</span>
</a>
</div>
<div>
<a href="https://twitter.com/apacheflink">
<div class="icon flink-twitter-icon"></div>
<span>Twitter</span>
</a>
</div>
</div>
</div>
</div>
<hr/>
<div class="container disclaimer">
<p>The contents of this website are © 2024 Apache Software Foundation under the terms of the Apache License v2. Apache Flink, Flink, and the Flink logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.</p>
</div>
</footer>
</body>
</html>