blob: 684d0eef27382053cdda37ce4924f93dbb8002fb [file] [log] [blame]
<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><title>File source connector · Apache Pulsar</title><meta name="viewport" content="width=device-width, initial-scale=1.0"/><meta name="generator" content="Docusaurus"/><meta name="description" content="The File source connector pulls messages from files in directories and persists the messages to Pulsar topics."/><meta name="docsearch:version" content="next"/><meta name="docsearch:language" content="en"/><meta property="og:title" content="File source connector · Apache Pulsar"/><meta property="og:type" content="website"/><meta property="og:url" content="https://pulsar.apache.org/"/><meta property="og:description" content="The File source connector pulls messages from files in directories and persists the messages to Pulsar topics."/><meta name="twitter:card" content="summary"/><meta name="twitter:image" content="https://pulsar.apache.org/img/pulsar.svg"/><link rel="shortcut icon" href="/img/pulsar.ico"/><link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/atom-one-dark.min.css"/><link rel="alternate" type="application/atom+xml" href="https://pulsar.apache.org/blog/atom.xml" title="Apache Pulsar Blog ATOM Feed"/><link rel="alternate" type="application/rss+xml" href="https://pulsar.apache.org/blog/feed.xml" title="Apache Pulsar Blog RSS Feed"/><link rel="stylesheet" href="/css/code-blocks-buttons.css"/><script type="text/javascript" src="https://buttons.github.io/buttons.js"></script><script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.0/clipboard.min.js"></script><script type="text/javascript" src="/js/custom.js"></script><script src="/js/scrollSpy.js"></script><link rel="stylesheet" href="/css/main.css"/><script src="/js/codetabs.js"></script></head><body class="sideNavVisible separateOnPageNav"><div class="fixedHeaderContainer"><div class="headerWrapper wrapper"><header><a 
href="/en"><img class="logo" src="/img/pulsar.svg" alt="Apache Pulsar"/></a><a href="/en/versions"><h3>next</h3></a><div class="navigationWrapper navigationSlider"><nav class="slidingNav"><ul class="nav-site nav-site-internal"><li class=""><a href="/docs/en/next/getting-started-standalone" target="_self">Docs</a></li><li class=""><a href="/en/download" target="_self">Download</a></li><li class=""><a href="/docs/en/next/client-libraries" target="_self">Clients</a></li><li class=""><a href="#restapis" target="_self">REST APIs</a></li><li class=""><a href="#cli" target="_self">Cli</a></li><li class=""><a href="/blog/" target="_self">Blog</a></li><li class=""><a href="#community" target="_self">Community</a></li><li class=""><a href="#apache" target="_self">Apache</a></li><li class=""><a href="https://pulsar-next.staged.apache.org/" target="_self">New Website (Beta)</a></li><span><li><a id="languages-menu" href="#"><img class="languages-icon" src="/img/language.svg" alt="Languages icon"/>English</a><div id="languages-dropdown" class="hide"><ul id="languages-dropdown-items"><li><a href="/docs/ja/next/io-file-source">日本語</a></li><li><a href="/docs/fr/next/io-file-source">Français</a></li><li><a href="/docs/ko/next/io-file-source">한국어</a></li><li><a href="/docs/zh-CN/next/io-file-source">中文</a></li><li><a href="/docs/zh-TW/next/io-file-source">繁體中文</a></li><li><a href="https://crowdin.com/project/apache-pulsar" target="_blank" rel="noreferrer noopener">Help Translate</a></li></ul></div></li><script>
// Language switcher for the top navigation bar.
// Clicking the "languages-menu" link toggles the "languages-dropdown" panel
// between the "hide" and "visible" class names.
const languagesMenuItem = document.getElementById("languages-menu");
const languagesDropDown = document.getElementById("languages-dropdown");
// Expose the collapsed/expanded state to assistive technology. Any class name
// other than "hide" is treated as open, mirroring the toggle logic below.
languagesMenuItem.setAttribute(
  "aria-expanded",
  String(languagesDropDown.className !== "hide")
);
languagesMenuItem.addEventListener("click", function(event) {
  // The link's href is "#", so cancel the default in-page navigation.
  event.preventDefault();
  const willOpen = languagesDropDown.className === "hide";
  languagesDropDown.className = willOpen ? "visible" : "hide";
  // Keep the announced state in sync with the visible state.
  languagesMenuItem.setAttribute("aria-expanded", String(willOpen));
});
</script></span></ul></nav></div></header></div></div><div class="navPusher"><div class="docMainWrapper wrapper"><div class="container mainContainer docsContainer"><div class="wrapper"><div class="post"><header class="postHeader"><a class="edit-page-link button" href="https://github.com/apache/pulsar/edit/master/site2/docs/io-file-source.md" target="_blank" rel="noreferrer noopener">Edit</a><h1 id="__docusaurus" class="postHeaderTitle">File source connector</h1></header><article><div><span><p>The File source connector pulls messages from files in directories and persists the messages to Pulsar topics.</p>
<h2><a class="anchor" aria-hidden="true" id="configuration"></a><a href="#configuration" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Configuration</h2>
<p>The configuration of the File source connector has the following properties.</p>
<h3><a class="anchor" aria-hidden="true" id="property"></a><a href="#property" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Property</h3>
<table>
<thead>
<tr><th>Name</th><th>Type</th><th>Required</th><th>Default</th><th>Description</th></tr>
</thead>
<tbody>
<tr><td><code>inputDirectory</code></td><td>String</td><td>true</td><td>No default value</td><td>The input directory to pull files.</td></tr>
<tr><td><code>recurse</code></td><td>Boolean</td><td>false</td><td>true</td><td>Whether to pull files from subdirectories or not.</td></tr>
<tr><td><code>keepFile</code></td><td>Boolean</td><td>false</td><td>false</td><td>If set to true, the file is not deleted after it is processed, which means the file can be picked up continually.</td></tr>
<tr><td><code>fileFilter</code></td><td>String</td><td>false</td><td>[^\.].*</td><td>The file whose name matches the given regular expression is picked up.</td></tr>
<tr><td><code>pathFilter</code></td><td>String</td><td>false</td><td>NULL</td><td>If <code>recurse</code> is set to true, the subdirectory whose path matches the given regular expression is scanned.</td></tr>
<tr><td><code>minimumFileAge</code></td><td>Integer</td><td>false</td><td>0</td><td>The minimum age that a file can be processed. <br><br>Any file younger than <code>minimumFileAge</code> (according to the last modification date) is ignored.</td></tr>
<tr><td><code>maximumFileAge</code></td><td>Long</td><td>false</td><td>Long.MAX_VALUE</td><td>The maximum age that a file can be processed. <br><br>Any file older than <code>maximumFileAge</code> (according to the last modification date) is ignored.</td></tr>
<tr><td><code>minimumSize</code></td><td>Integer</td><td>false</td><td>1</td><td>The minimum size (in bytes) that a file can be processed.</td></tr>
<tr><td><code>maximumSize</code></td><td>Double</td><td>false</td><td>Double.MAX_VALUE</td><td>The maximum size (in bytes) that a file can be processed.</td></tr>
<tr><td><code>ignoreHiddenFiles</code></td><td>Boolean</td><td>false</td><td>true</td><td>Whether the hidden files should be ignored or not.</td></tr>
<tr><td><code>pollingInterval</code></td><td>Long</td><td>false</td><td>10000L</td><td>Indicates how long to wait before performing a directory listing.</td></tr>
<tr><td><code>numWorkers</code></td><td>Integer</td><td>false</td><td>1</td><td>The number of worker threads that process files.<br><br> This allows you to process a larger number of files concurrently. <br><br>However, setting this to a value greater than 1 causes the data from multiple files to be mixed in the target topic.</td></tr>
<tr><td><code>processedFileSuffix</code></td><td>String</td><td>false</td><td>NULL</td><td>If set, a file that has been processed is renamed instead of being deleted. <br><br> This config only works when the <code>keepFile</code> property is false.</td></tr>
</tbody>
</table>
<h3><a class="anchor" aria-hidden="true" id="example"></a><a href="#example" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Example</h3>
<p>Before using the File source connector, you need to create a configuration file through one of the following methods.</p>
<ul>
<li><p>JSON</p>
<pre><code class="hljs css language-json">{
<span class="hljs-attr">"configs"</span>: {
<span class="hljs-attr">"inputDirectory"</span>: <span class="hljs-string">"/Users/david"</span>,
<span class="hljs-attr">"recurse"</span>: <span class="hljs-literal">true</span>,
<span class="hljs-attr">"keepFile"</span>: <span class="hljs-literal">true</span>,
<span class="hljs-attr">"fileFilter"</span>: <span class="hljs-string">"[^\\.].*"</span>,
<span class="hljs-attr">"pathFilter"</span>: <span class="hljs-string">".*"</span>,
<span class="hljs-attr">"minimumFileAge"</span>: <span class="hljs-number">0</span>,
<span class="hljs-attr">"maximumFileAge"</span>: <span class="hljs-number">9999999999</span>,
<span class="hljs-attr">"minimumSize"</span>: <span class="hljs-number">1</span>,
<span class="hljs-attr">"maximumSize"</span>: <span class="hljs-number">5000000</span>,
<span class="hljs-attr">"ignoreHiddenFiles"</span>: <span class="hljs-literal">true</span>,
<span class="hljs-attr">"pollingInterval"</span>: <span class="hljs-number">5000</span>,
<span class="hljs-attr">"numWorkers"</span>: <span class="hljs-number">1</span>,
<span class="hljs-attr">"processedFileSuffix"</span>: <span class="hljs-string">".processed_done"</span>
}
}
</code></pre></li>
<li><p>YAML</p>
<pre><code class="hljs css language-yaml"><span class="hljs-attr">configs:</span>
<span class="hljs-attr">inputDirectory:</span> <span class="hljs-string">"/Users/david"</span>
<span class="hljs-attr">recurse:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">keepFile:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">fileFilter:</span> <span class="hljs-string">"[^\\.].*"</span>
<span class="hljs-attr">pathFilter:</span> <span class="hljs-string">".*"</span>
<span class="hljs-attr">minimumFileAge:</span> <span class="hljs-number">0</span>
<span class="hljs-attr">maximumFileAge:</span> <span class="hljs-number">9999999999</span>
<span class="hljs-attr">minimumSize:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">maximumSize:</span> <span class="hljs-number">5000000</span>
<span class="hljs-attr">ignoreHiddenFiles:</span> <span class="hljs-literal">true</span>
<span class="hljs-attr">pollingInterval:</span> <span class="hljs-number">5000</span>
<span class="hljs-attr">numWorkers:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">processedFileSuffix:</span> <span class="hljs-string">".processed_done"</span>
</code></pre></li>
</ul>
<h2><a class="anchor" aria-hidden="true" id="usage"></a><a href="#usage" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Usage</h2>
<p>Here is an example of using the File source connector.</p>
<ol>
<li><p>Pull a Pulsar image.</p>
<pre><code class="hljs css language-bash">$ docker pull apachepulsar/pulsar:{version}
</code></pre></li>
<li><p>Start Pulsar standalone.</p>
<pre><code class="hljs css language-bash">$ docker run -d -it -p 6650:6650 -p 8080:8080 -v <span class="hljs-variable">$PWD</span>/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:{version} bin/pulsar standalone
</code></pre></li>
<li><p>Create a configuration file <em>file-connector.yaml</em>.</p>
<pre><code class="hljs css language-yaml"><span class="hljs-attr">configs:</span>
<span class="hljs-attr">inputDirectory:</span> <span class="hljs-string">"/opt"</span>
</code></pre></li>
<li><p>Copy the configuration file <em>file-connector.yaml</em> to the container.</p>
<pre><code class="hljs css language-bash">$ docker cp connectors/file-connector.yaml pulsar-standalone:/pulsar/
</code></pre></li>
<li><p>Download the File source connector.</p>
<pre><code class="hljs css language-bash">$ curl -O https://mirrors.tuna.tsinghua.edu.cn/apache/pulsar/pulsar-{version}/connectors/pulsar-io-file-{version}.nar
</code></pre></li>
<li><p>Copy it to the <code>connectors</code> folder, then restart the container.</p>
<pre><code class="hljs css language-bash">$ docker cp pulsar-io-file-{version}.nar pulsar-standalone:/pulsar/connectors/
$ docker restart pulsar-standalone
</code></pre></li>
<li><p>Start the File source connector.</p>
<pre><code class="hljs css language-bash">$ docker <span class="hljs-built_in">exec</span> -it pulsar-standalone /bin/bash
$ ./bin/pulsar-admin sources localrun \
--archive /pulsar/connectors/pulsar-io-file-{version}.nar \
--name file-test \
--destination-topic-name pulsar-file-test \
--<span class="hljs-built_in">source</span>-config-file /pulsar/file-connector.yaml
</code></pre></li>
<li><p>Start a consumer.</p>
<pre><code class="hljs css language-bash">./bin/pulsar-client consume -s file-test -n 0 pulsar-file-test
</code></pre></li>
<li><p>Write the message to the file <em>test.txt</em>.</p>
<pre><code class="hljs css language-bash"><span class="hljs-built_in">echo</span> <span class="hljs-string">"hello world!"</span> &gt; /opt/test.txt
</code></pre>
<p>The following information appears on the consumer terminal window.</p>
<pre><code class="hljs css language-bash">----- got message -----
hello world!
</code></pre></li>
</ol>
</span></div></article></div><div class="docs-prevnext"></div></div></div><nav class="onPageNav"><ul class="toc-headings"><li><a href="#configuration">Configuration</a><ul class="toc-headings"><li><a href="#property">Property</a></li><li><a href="#example">Example</a></li></ul></li><li><a href="#usage">Usage</a></li></ul></nav></div><footer class="nav-footer" id="footer"><section class="copyright">Copyright © 2022 The Apache Software Foundation. All Rights Reserved. Apache, Apache Pulsar and the Apache feather logo are trademarks of The Apache Software Foundation.</section><span><script>
// Community menu for the top navigation bar.
// Finds the placeholder nav link whose href is "#community", replaces the
// content of its parent element with a drop-down of community links, and
// toggles that drop-down between the "hide" and "visible" class names on click.
const community = document.querySelector("a[href='#community']").parentNode;
// NOTE(review): the markup below is wrapped in its own <li> and is injected
// into the link's parent, which is itself an <li> in this page. The nesting is
// kept as-is because the site stylesheet may rely on it; confirm before
// flattening.
// External links open in a new tab and carry rel="noreferrer noopener", the
// same as the other target="_blank" links on this page. The "&#x2750;"
// character references are terminated with ";" so they parse without error.
const communityMenu =
  '<li>' +
  '<a id="community-menu" href="#">Community <span style="font-size: 0.75em">&nbsp;▼</span></a>' +
  '<div id="community-dropdown" class="hide">' +
  '<ul id="community-dropdown-items">' +
  '<li><a href="/en/contact">Contact</a></li>' +
  '<li><a href="/en/contributing">Contributing</a></li>' +
  '<li><a href="/en/coding-guide">Coding guide</a></li>' +
  '<li><a href="/en/events">Events</a></li>' +
  '<li><a href="https://twitter.com/Apache_Pulsar" target="_blank" rel="noreferrer noopener">Twitter &#x2750;</a></li>' +
  '<li><a href="https://github.com/apache/pulsar/wiki" target="_blank" rel="noreferrer noopener">Wiki &#x2750;</a></li>' +
  '<li><a href="https://github.com/apache/pulsar/issues" target="_blank" rel="noreferrer noopener">Issue tracking &#x2750;</a></li>' +
  '<li><a href="https://pulsar-summit.org/" target="_blank" rel="noreferrer noopener">Pulsar Summit &#x2750;</a></li>' +
  '<li>&nbsp;</li>' +
  '<li><a href="/en/resources">Resources</a></li>' +
  '<li><a href="/en/team">Team</a></li>' +
  '<li><a href="/en/powered-by">Powered By</a></li>' +
  '</ul>' +
  '</div>' +
  '</li>';
community.innerHTML = communityMenu;
// These lookups must run after the innerHTML assignment above, which is what
// creates the two elements.
const communityMenuItem = document.getElementById("community-menu");
const communityDropDown = document.getElementById("community-dropdown");
// Expose the collapsed/expanded state to assistive technology; the drop-down
// is injected with class "hide", so it starts collapsed.
communityMenuItem.setAttribute("aria-expanded", "false");
communityMenuItem.addEventListener("click", function(event) {
  // The link's href is "#", so cancel the default in-page navigation.
  event.preventDefault();
  const willOpen = communityDropDown.className === 'hide';
  communityDropDown.className = willOpen ? 'visible' : 'hide';
  // Keep the announced state in sync with the visible state.
  communityMenuItem.setAttribute("aria-expanded", String(willOpen));
});
</script></span></footer></div><!-- Twitter widgets loader. Unless an element with id "twitter-wjs" already exists, the script below creates a script element with that id whose src is https://platform.twitter.com/widgets.js and inserts it before the first script element in the document. In that case it also gives window.twttr an empty _e array and a ready(f) function that pushes f onto _e. --><script>window.twttr=(function(d,s, id){var js,fjs=d.getElementsByTagName(s)[0],t=window.twttr||{};if(d.getElementById(id))return t;js=d.createElement(s);js.id=id;js.src='https://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js, fjs);t._e = [];t.ready = function(f) {t._e.push(f);};return t;}(document, 'script', 'twitter-wjs'));</script></body></html>