blob: de806f955500628454eda2024186b8fcc2291817 [file] [log] [blame]
/*
 * Webpack chunk 1690 of the "website" bundle (build output, not hand-written).
 *
 * Registers two modules on the shared `self.webpackChunkwebsite` queue:
 *   - 4137: MDX runtime helpers. Exports `Zo` (context provider) and `kt`
 *           (createElement wrapper that routes tags through MDXCreateElement).
 *   - 5340: the compiled blog post "Apache Pinot Tutorial for Getting Started".
 *           Exports frontMatter, metadata, toc, contentTitle and the default
 *           MDX content component.
 *
 * NOTE(review): the text corrections marked "FIX" below are applied to the
 * generated output. The `source` / `editUrl` metadata point at the markdown
 * file this was compiled from; mirror the fixes there or a rebuild will
 * presumably regenerate the old text.
 */
"use strict";
(self.webpackChunkwebsite = self.webpackChunkwebsite || []).push([
  [1690],
  {
    /* ------------------------------------------------------------------ *
     * Module 4137 - MDX runtime (provider + createElement wrapper)
     * ------------------------------------------------------------------ */
    4137: function (module, exports, __webpack_require__) {
      __webpack_require__.d(exports, {
        Zo: function () {
          return MDXProvider;
        },
        kt: function () {
          return createElement;
        }
      });

      // Module 7294 is used through createContext/useContext/createElement/
      // forwardRef/Fragment - presumably React.
      var React = __webpack_require__(7294);

      /** Sets target[key] = value, redefining the property when it already exists. */
      function defineProperty(target, key, value) {
        if (key in target) {
          Object.defineProperty(target, key, {
            value: value,
            enumerable: true,
            configurable: true,
            writable: true
          });
        } else {
          target[key] = value;
        }
        return target;
      }

      /** Own string keys plus own symbols (only enumerable symbols when asked). */
      function ownKeys(object, enumerableOnly) {
        var keys = Object.keys(object);
        if (Object.getOwnPropertySymbols) {
          var symbols = Object.getOwnPropertySymbols(object);
          if (enumerableOnly) {
            symbols = symbols.filter(function (symbol) {
              return Object.getOwnPropertyDescriptor(object, symbol).enumerable;
            });
          }
          keys.push.apply(keys, symbols);
        }
        return keys;
      }

      /**
       * Object-spread helper: copies every following argument onto `target`.
       * Odd-positioned sources are copied by value, even-positioned ones by
       * property descriptor (this mirrors the original helper exactly).
       */
      function objectSpread(target) {
        for (var index = 1; index < arguments.length; index++) {
          var source = arguments[index] != null ? arguments[index] : {};
          if (index % 2) {
            ownKeys(Object(source), true).forEach(function (key) {
              defineProperty(target, key, source[key]);
            });
          } else if (Object.getOwnPropertyDescriptors) {
            Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
          } else {
            ownKeys(Object(source)).forEach(function (key) {
              Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
            });
          }
        }
        return target;
      }

      /** Shallow copy of `source` without the keys listed in `excluded`. */
      function objectWithoutProperties(source, excluded) {
        if (source == null) {
          return {};
        }
        var target = {};
        var key;
        var i;
        var names = Object.keys(source);
        for (i = 0; i < names.length; i++) {
          key = names[i];
          if (excluded.indexOf(key) < 0) {
            target[key] = source[key];
          }
        }
        if (Object.getOwnPropertySymbols) {
          var symbols = Object.getOwnPropertySymbols(source);
          for (i = 0; i < symbols.length; i++) {
            key = symbols[i];
            if (
              excluded.indexOf(key) < 0 &&
              Object.prototype.propertyIsEnumerable.call(source, key)
            ) {
              target[key] = source[key];
            }
          }
        }
        return target;
      }

      var MDXContext = React.createContext({});

      /**
       * Resolves the component map: the context value, optionally replaced by
       * `components(context)` (function form) or merged with `components`
       * (object form, `components` wins on conflicts).
       */
      var useMDXComponents = function (components) {
        var contextComponents = React.useContext(MDXContext);
        var resolved = contextComponents;
        if (components) {
          resolved =
            typeof components === "function"
              ? components(contextComponents)
              : objectSpread(objectSpread({}, contextComponents), components);
        }
        return resolved;
      };

      /** Provider that publishes the resolved component map to descendants. */
      var MDXProvider = function (props) {
        var components = useMDXComponents(props.components);
        return React.createElement(MDXContext.Provider, { value: components }, props.children);
      };

      var TYPE_PROP = "mdxType";

      // Fallbacks used when neither context nor props supply a component.
      var DEFAULTS = {
        inlineCode: "code",
        wrapper: function (props) {
          var children = props.children;
          return React.createElement(React.Fragment, {}, children);
        }
      };

      /**
       * Picks the concrete component for an MDX node. Lookup order:
       * "<parentName>.<mdxType>", then "<mdxType>", then DEFAULTS, then the
       * original type passed to createElement.
       */
      var MDXCreateElement = React.forwardRef(function (props, ref) {
        var propComponents = props.components;
        var mdxType = props.mdxType;
        var originalType = props.originalType;
        var parentName = props.parentName;
        var rest = objectWithoutProperties(props, [
          "components",
          "mdxType",
          "originalType",
          "parentName"
        ]);
        var components = useMDXComponents(propComponents);
        var Component =
          components["".concat(parentName, ".").concat(mdxType)] ||
          components[mdxType] ||
          DEFAULTS[mdxType] ||
          originalType;

        if (propComponents) {
          return React.createElement(
            Component,
            objectSpread(objectSpread({ ref: ref }, rest), {}, { components: propComponents })
          );
        }
        return React.createElement(Component, objectSpread({ ref: ref }, rest));
      });

      /**
       * createElement replacement used by compiled MDX. String tags and
       * elements carrying `mdxType` are rendered through MDXCreateElement so
       * they can be overridden; anything else goes straight to
       * React.createElement with the arguments untouched.
       */
      function createElement(type, props) {
        var args = arguments;
        var mdxType = props && props.mdxType;

        if (typeof type === "string" || mdxType) {
          var argsLength = args.length;
          var forwarded = new Array(argsLength);
          forwarded[0] = MDXCreateElement;

          var newProps = {};
          for (var key in props) {
            // Explicit Object.prototype lookup instead of relying on a global
            // `hasOwnProperty` being reachable from the global object.
            if (Object.prototype.hasOwnProperty.call(props, key)) {
              newProps[key] = props[key];
            }
          }
          newProps.originalType = type;
          newProps[TYPE_PROP] = typeof type === "string" ? type : mdxType;
          forwarded[1] = newProps;

          for (var i = 2; i < argsLength; i++) {
            forwarded[i] = args[i];
          }
          return React.createElement.apply(null, forwarded);
        }
        return React.createElement.apply(null, args);
      }

      MDXCreateElement.displayName = "MDXCreateElement";
    },

    /* ------------------------------------------------------------------ *
     * Module 5340 - compiled blog post
     * ------------------------------------------------------------------ */
    5340: function (module, exports, __webpack_require__) {
      __webpack_require__.r(exports);
      __webpack_require__.d(exports, {
        contentTitle: function () {
          return contentTitle;
        },
        default: function () {
          return MDXContent;
        },
        frontMatter: function () {
          return frontMatter;
        },
        metadata: function () {
          return metadata;
        },
        toc: function () {
          return toc;
        }
      });

      // Require order is kept as generated: 7462, 3366, 7294, 4137.
      // 7462 / 3366 expose `.Z` helpers used below to merge props and to drop
      // keys from props (presumably Babel's extends / objectWithoutProperties).
      var _extends = __webpack_require__(7462);
      var _objectWithoutProperties = __webpack_require__(3366);
      __webpack_require__(7294); // loaded only for its side effects here
      var mdx = __webpack_require__(4137);

      var EXCLUDED_PROPS = ["components"];

      var POST_TITLE = "Apache Pinot Tutorial for Getting Started - A Step-by-Step Guide";
      var POST_DESCRIPTION =
        "This blog post is a guide to getting started with Apache Pinot, including installing and running the necessary components of a Pinot cluster. It also explains how to set up schemas, tables, and load data into Pinot, as well as how to run queries using the Pinot Data Explorer. The next article in the series will cover consuming event streaming data with Apache Pinot and Apache Kafka.";

      var frontMatter = {
        title: POST_TITLE,
        author: "Barkha Herman",
        author_title: "Developer Advocate",
        author_url: "https://pinot.apache.org/",
        author_image_url: "https://pinot.apache.org/authors/pinot_team.jpg",
        description: POST_DESCRIPTION,
        keywords: [
          "Apache Pinot",
          "getting started with Pinot",
          "DataExplorer",
          "streaming data",
          "Apache Kafka"
        ],
        tags: [
          "Pinot",
          "Data",
          "Analytics",
          "User-Facing Analytics",
          "data explorer",
          "getting started",
          "streaming",
          "kafka"
        ]
      };

      var contentTitle = void 0;

      var metadata = {
        permalink: "/blog/2023/05/18/apache-pinot-tutorial-for-getting-started-a-step-by-step-guide",
        editUrl: "https://github.com/apache/pinot-site/edit/dev/website/blog/2023-05-18-apache-pinot-tutorial-for-getting-started-a-step-by-step-guide.md",
        source: "@site/blog/2023-05-18-apache-pinot-tutorial-for-getting-started-a-step-by-step-guide.md",
        title: POST_TITLE,
        description: POST_DESCRIPTION,
        date: "2023-05-18T00:00:00.000Z",
        formattedDate: "May 18, 2023",
        tags: [
          { label: "Pinot", permalink: "/blog/tags/pinot" },
          { label: "Data", permalink: "/blog/tags/data" },
          { label: "Analytics", permalink: "/blog/tags/analytics" },
          { label: "User-Facing Analytics", permalink: "/blog/tags/user-facing-analytics" },
          { label: "data explorer", permalink: "/blog/tags/data-explorer" },
          { label: "getting started", permalink: "/blog/tags/getting-started" },
          { label: "streaming", permalink: "/blog/tags/streaming" },
          { label: "kafka", permalink: "/blog/tags/kafka" }
        ],
        readingTime: 7.91,
        truncated: false,
        prevItem: {
          title: "Change Data Capture with Apache Pinot - How Does It Work?",
          permalink: "/blog/2023/05/23/change-data-capture-with-apache-pinot-how-does-it-work"
        },
        nextItem: {
          title: "StarTree Indexes in Apache Pinot Part-1 - Understanding the Impact on Query Performance",
          permalink: "/blog/2023/05/16/star-tree-indexes-in-apache-pinot-part-1-understanding-the-impact-on-query-performance"
        }
      };

      var toc = [
        {
          value: "The Obligatory What is Apache Pinot and StarTree Section",
          id: "the-obligatory-what-is-apache-pinot-and-startree-section",
          children: []
        },
        {
          value: "What do you need to run Apache Pinot?",
          id: "what-do-you-need-to-run-apache-pinot",
          children: []
        },
        {
          value: "Step-by-step installation of Apache Pinot",
          id: "step-by-step-installation-of-apache-pinot",
          children: [
            { value: "Step 1:\xa0", id: "step-1", children: [] },
            { value: "Step 2:", id: "step-2", children: [] },
            { value: "Step 3:", id: "step-3", children: [] },
            { value: "Step 4:", id: "step-4", children: [] },
            { value: "Step 5:", id: "step-5", children: [] }
          ]
        },
        { value: "What\u2019s under the hood?", id: "whats-under-the-hood", children: [] },
        { value: "Conclusion", id: "conclusion", children: [] }
      ];

      var layoutProps = { toc: toc };
      var LAYOUT_WRAPPER = "wrapper";

      /** Forwards all arguments to the MDX createElement wrapper (`kt`), looked up at call time. */
      function el() {
        return mdx.kt.apply(null, arguments);
      }

      /** Inline link inside a paragraph. */
      function link(href, text) {
        return el("a", { parentName: "p", href: href }, text);
      }

      /** Emphasised text inside a paragraph. */
      function emphasis(text) {
        return el("em", { parentName: "p" }, text);
      }

      /** Paragraph holding a single image; the same text is used for alt and title. */
      function image(src, caption) {
        return el(
          "p",
          null,
          el("img", { parentName: "p", src: src, alt: caption, title: caption })
        );
      }

      /** <pre><code> block; `language` may be null for an unlabelled block. */
      function codeBlock(language, code) {
        var codeProps = { parentName: "pre" };
        if (language) {
          codeProps.className = "language-" + language;
        }
        return el("pre", null, el("code", codeProps, code));
      }

      /** Unordered list with one <li> per entry of `items`. */
      function bulletList(items) {
        var listArgs = ["ul", null];
        for (var i = 0; i < items.length; i++) {
          listArgs.push(el("li", { parentName: "ul" }, items[i]));
        }
        return el.apply(null, listArgs);
      }

      /**
       * Default export: renders the post body inside the "wrapper" layout.
       * `components` is passed through to the layout; all other props are
       * merged over `{ toc }`.
       */
      function MDXContent(props) {
        var components = props.components;
        var rest = (0, _objectWithoutProperties.Z)(props, EXCLUDED_PROPS);

        return el(
          LAYOUT_WRAPPER,
          (0, _extends.Z)({}, layoutProps, rest, { components: components, mdxType: "MDXLayout" }),

          el(
            "p",
            null,
            "How do you get started with ",
            link("https://startree.ai/resources/what-is-apache-pinot", "Apache Pinot\u2122"),
            "? Good question! To save you the hassle of trying to tackle this on your own, here\u2019s a handy guide that overviews all of the components that make up Pinot and how to set Pinot up."
          ),

          el(
            "h2",
            { id: "the-obligatory-what-is-apache-pinot-and-startree-section" },
            "The Obligatory What is Apache Pinot and StarTree Section"
          ),
          el(
            "p",
            null,
            link("https://startree.ai/what-is-apache-pinot", "Pinot"),
            " is an open source, free-to-use, real-time, and distributed OLAP datastore, purpose built to provide ultra low-latency analytics at extremely high throughput."
          ),
          el(
            "p",
            null,
            "StarTree offers a fully managed version of the Apache Pinot ",
            link("https://startree.ai/resources/what-is-real-time-analytics", "real-time analytics"),
            " system and other tools around it, such as a real-time anomaly detection and root cause analysis tool, which you can ",
            link("https://startree.ai/saas-signup", "try for free"),
            "."
          ),

          el("h2", { id: "what-do-you-need-to-run-apache-pinot" }, "What do you need to run Apache Pinot?"),
          el("p", null, "The Docker image that we will use runs multiple services. To accommodate this, we recommend at a minimum the following resources in order to run the sample:"),
          bulletList([
            "CPUs: four or more",
            "Memory: 8 GB or more",
            "Swap: 2 GB or more",
            "Disk space: 10 GB or more"
          ]),
          el("p", null, "Note: When importing custom data or event streaming, you may need more resources. Additionally, note that if not set, Docker will use resources from the host environment as needed and available."),

          el("h2", { id: "step-by-step-installation-of-apache-pinot" }, "Step-by-step installation of Apache Pinot"),
          el("p", null, "For this intro tutorial, we will use Docker. Alternatively, you can run Pinot locally if you wish.\xa0"),
          el("p", null, "The instructions use a Windows 11 computer, but they will work on Macs as well. Also note that I am using VS Code with the Docker extension installed."),

          el("h3", { id: "step-1" }, "Step 1:\xa0"),
          el(
            "p",
            null,
            "Make sure you have ",
            link("https://docs.docker.com/get-docker/", "Docker installed"),
            " on your machine."
          ),
          el("p", null, emphasis("Docker is a set of platform as a service (PaaS) products that use OS-level virtualization to deliver software in packages called containers.")),

          el("h3", { id: "step-2" }, "Step 2:"),
          el("p", null, "Now, let\u2019s download the Docker image. On a Windows machine, start a new PowerShell command window. Note that this is not the same as a Windows PowerShell command window, as shown below.\xa0"),
          image(
            "https://www.datocms-assets.com/75153/1684419409-image7.png",
            "Download Docker image on Windows with PowerShell command window"
          ),
          el("p", null, "Use the following command to get (pull) the image we are looking for:"),
          codeBlock("bash", "docker pull apachepinot/pinot:0.12.0\n"),
          el("p", null, "You can also download the latest version like so:"),
          codeBlock("bash", "docker pull apachepinot/pinot:latest\n"),
          el("p", null, "Here, apachepinot is the name of the repository in Docker Hub, pinot is the name of the image, and :latest or :0.12.0 is the version for the image.\xa0 Note that we will be using the 0.12.0 version for this blog post."),
          el(
            "p",
            null,
            emphasis("Docker Hub is the world\u2019s largest repository of container images in the world."),
            "\xa0"
          ),
          el("p", null, "You can verify the image was downloaded or pulled by running the following command:"),
          codeBlock("bash", "docker images\n"),
          el("p", null, "It should show you the image like so:"),
          image("https://www.datocms-assets.com/75153/1684420004-image3.png", "Docker images command"),

          el("h3", { id: "step-3" }, "Step 3:"),
          el("p", null, "Let\u2019s run a container using the Docker image that we downloaded:"),
          codeBlock("bash", "docker run -it --entrypoint /bin/bash -p 9000:9000 apachepinot/pinot:0.12.0\n"),
          image(
            "https://www.datocms-assets.com/75153/1684420103-image4.png",
            "Running a container with downloaded Docker image"
          ),
          el(
            "p",
            null,
            "The docker run command runs the image. The ",
            "-",
            // FIX: text said "-p 9000:00" and "\u2013entrypoint" (en dash); both now
            // match the command shown above (-p 9000:9000, --entrypoint).
            "p 9000:9000 option maps the docker container port 9000 to the local machine port 9000. This allows us to access the Pinot UI, which defaults to port 9000 to be accessible from the localhost. We are using the --entrypoint to override the default entrypoint and replace it with Bash. We want to override the default behavior so that we can start each component one at a time. The next parameter apachepinot/pinot:0.12.0 is the Docker image we pulled above."
          ),
          el("p", null, "After running the command, we\u2019ll find ourselves in the Docker container instance running Bash shell. We can use ls to list the contents of the Docker container as shown above."),
          el("p", null, "If you\u2019re using VS Code, with the Docker extension installed, you can click on the Docker extension and see our container and its content:"),
          image(
            "https://www.datocms-assets.com/75153/1684421493-image11.png",
            "VS Code Docker extension open to see container and content"
          ),
          el("p", null, "Click on the Docker icon in the left menu, and apachepinot/pinot:0.12.0. This should take a few seconds to connect to the running container. Now, you can navigate to the files and see what we have under the opt folder."),

          el("h3", { id: "step-4" }, "Step 4:"),
          el("p", null, "Let\u2019s run the components that are essential to running a Pinot cluster. Change directory to the bin folder and list the contents like so:"),
          image(
            "https://www.datocms-assets.com/75153/1684421611-image10.png",
            "Running components, directory changed to bin folder and contents listed"
          ),
          el("p", null, "In order to start the Pinot cluster, we will need to run the following essential components:"),
          bulletList(["Apache ZooKeeper\u2122", "Controller", "Broker", "Server"]),
          el("p", null, "Start ZooKeeper using the following command:"),
          codeBlock("bash", "./pinot-admin.sh StartZookeeper &\n"),
          el("p", null, "pinot-admin.sh is a shell script for starting the various components. The & allows us to continue using the Bash shell. ZooKeeper is responsible for the configuration for the Pinot cluster and needs to be started first."),
          el("p", null, "We can start the remaining components using these commands one at a time:"),
          codeBlock("bash", "./pinot-admin.sh StartController &\n./pinot-admin.sh StartBroker &\n./pinot-admin.sh StartServer &\n"),
          el(
            "p",
            null,
            "The controller controls the cluster health and coordinates with ZooKeeper for configuration and status changes. The broker is responsible for query distribution and result collation, sometimes called Scatter-Gather. Servers manage individual table segments and perform the actual read/writes. To get a better understanding of each component, read this ",
            link("https://startree.ai/blog/introduction-to-apache-pinot-a-beginners-guide", "intro to Apache Pinot"),
            "."
          ),
          el("p", null, "At this time, we should have a running Pinot cluster. We can verify via the Pinot Data Explorer by browsing to localhost:9000. You should see something like this:"),
          image("https://www.datocms-assets.com/75153/1684419932-image2.png", "Pinot data explorer"),
          el("p", null, "What just happened?"),
          el("p", null, "Let\u2019s dive in."),
          el("p", null, "We have started the four essential components of Pinot, however, you will note that there is not yet any data in our fresh new instance."),
          el("p", null, "Before we create a table and load data, notice the four navigation menus on the left-hand side. You can look at the cluster status, run queries, inspect ZooKeeper, or launch the Swagger endpoints for the REST API that Pinot supports."),
          el("p", null, "On the cluster, we notice that we have the essentials deployed: controller, broker, and server. Currently, there are no tables and no minions\u2014dispatchable components used for task management\u2014exist, though Notice also that multi-tenancy support is available in the cluster manager."),

          el("h3", { id: "step-5" }, "Step 5:"),
          el("p", null, "Now that we have our Apache Pinot cluster ready, let\u2019s load some data. Of course, before we do that, we have to create a schema.\xa0"),
          el("p", null, "Let\u2019s navigate to the folder:"),
          codeBlock("bash", "cd /opt/pinot/examples/batch/baseballStats\n"),
          el("p", null, "You will notice that there are the following files listed here:"),
          codeBlock(null, "baseballStats_offline_table_config.json \nbaseballStats_schema.json \ningestionJobSpec.yaml \nsparkIngestionJobSpec.yaml \nrawdata\n"),
          el("p", null, "From the names, we can see that there is a schema file, a table config file, an ingestion job, and Apache Spark\u2122 ingestion job files as well as a raw data folder."),
          el("p", null, "The content of the schema file contains both metric and dimension like so (abbreviated):"),
          codeBlock("json", '{\n "metricFieldSpecs": [\n {\n "dataType": "INT",\n "name": "playerStint"\n },\n \u2026 \n {\n "dataType": "INT",\n "name": "baseOnBalls"\n },\n ],\n "dimensionFieldSpecs": [\n {\n "dataType": "STRING",\n "name": "playerID"\n },\n \u2026.\n {\n "dataType": "STRING",\n "name": "playerName"\n }\n ],\n "schemaName": "baseballStats"\n}\n'),
          // FIX: text said "/app/pinot/bin"; every other path in the post lives
          // under /opt/pinot, and pinot-admin.sh is run from its bin folder.
          el("p", null, "To create a schema and table for the baseball stats file, run the following command from the /opt/pinot/bin folder:"),
          codeBlock("bash", "./pinot-admin.sh AddTable -schemaFile /opt/pinot/examples/batch/baseballStats/baseballStats_schema.json -tableConfigFile /opt/pinot/examples/batch/baseballStats/baseballStats_offline_table_config.json -exec\n"),
          el("p", null, "You should now see the schema and table created:"),
          image("https://www.datocms-assets.com/75153/1684421406-image12.png", "Apache Pinot tables created"),
          el("p", null, "Next, we\u2019ll want to load some data into the table that we created. We have some sample data in the folder rawdata that we can use to load. We will need a YAML file to perform the actual ingestion job and can use the following command to import data:"),
          codeBlock("bash", "./pinot-admin.sh LaunchDataIngestionJob -jobSpecFile /opt/pinot/examples/batch/baseballStats/ingestionJobSpec.yaml\n\n"),
          // FIX: text said "ingestJobSpec.yaml"; the file listed and used above
          // is ingestionJobSpec.yaml.
          el("p", null, "If you run into trouble on this step like I did, edit the ingestionJobSpec.yaml file using Docker Desktop to change the inputDirURI from relative to absolute path. Then rerun the above command."),
          image(
            "https://www.datocms-assets.com/75153/1684419802-image1.png",
            "Editing the .yaml file with Docker Desktop"
          ),
          el("p", null, "You should now be able to see the table has been populated like so:"),
          image("https://www.datocms-assets.com/75153/1684421215-image8.png", "Apache Pinot table populated"),
          el("p", null, "Now, let\u2019s run some queries. From localhost:9000, select the Query Console in the left-hand menu. Then type in some of these queries:"),
          codeBlock("sql", "select * from baseballStats limit 10\nselect sum(runs), playerName from baseballStats group by playerName order by sum(runs) desc\n"),
          el("p", null, "You should see results like so:"),
          image("https://www.datocms-assets.com/75153/1684421163-image6.png", "Apache Pinot query console"),
          el("p", null, "And there you have it!"),

          el("h2", { id: "whats-under-the-hood" }, "What\u2019s under the hood?"),
          el("p", null, "If you\u2019re curious to go a step further and see what the segments look like and what the actual data on disk looks like, keep reading! In the Tables section of localhost:9000, you can scroll down to find a segment:"),
          image("https://www.datocms-assets.com/75153/1684421358-image9.png", "Apache Pinot data on disk segment"),
          el("p", null, "Clicking on this gives the specifics of the segment:"),
          image("https://www.datocms-assets.com/75153/1684420155-image5.png", "Segment specifics in Pinot UI"),
          el("p", null, "Pinot allows you to easily inspect your segments and tables in one easy-to-use UI. You can find what\u2019s where and keep an eye on size, location, number of documents, etc."),

          el("h2", { id: "conclusion" }, "Conclusion"),
          el("p", null, "Congratulations!"),
          el("p", null, "Together, we\u2019ve:"),
          bulletList([
            "Installed and ran Apache Pinot components",
            "Created a table schema and a table",
            "Loaded data in a table",
            "Ran a few queries",
            "Explored the Pinot UI"
          ]),
          el("p", null, "In my next article, we\u2019ll consume event streaming data using Apache Pinot and Apache Kafka\xae."),
          el(
            "p",
            null,
            "In the meantime, run more queries, load more data, and don\u2019t forget to ",
            link("https://communityinviter.com/apps/startreedata/startree-community", "join the Community Slack"),
            " for support if you get stuck!"
          )
        );
      }

      MDXContent.isMDXComponent = true;
    }
  }
]);