blob: 31d761598ea90d852ed501641ed73839606e1054 [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Geo</title>
<link rel="stylesheet" href="/assets/css/app.css">
<link rel="shortcut icon" href="/assets/images/favicon.ico">
<link rel="stylesheet" href="/assets/css/utilities.min.css">
<link rel="stylesheet" href="/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Geo | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.3" />
<meta property="og:title" content="Geo" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Pegasus GEO" />
<meta property="og:description" content="Pegasus GEO" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2024-04-22T06:39:52+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Geo" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2024-04-22T06:39:52+00:00","datePublished":"2024-04-22T06:39:52+00:00","description":"Pegasus GEO","headline":"Geo","mainEntityOfPage":{"@type":"WebPage","@id":"/api/geo"},"url":"/api/geo"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/">
<img src="/assets/images/pegasus-logo-inv.png" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">The Pegasus documentation</p>
<ul class="menu-list">
<li>
<a href="/docs/downloads"
class="">
Downloads
</a>
</li>
</ul>
<p class="menu-label">Building Pegasus</p>
<ul class="menu-list">
<li>
<a href="/docs/build/compile-by-docker"
class="">
Compile by docker (recommended)
</a>
</li>
<li>
<a href="/docs/build/compile-from-source"
class="">
Compile from source
</a>
</li>
</ul>
<p class="menu-label">Client Libs</p>
<ul class="menu-list">
<li>
<a href="/clients/java-client"
class="">
Java Client
</a>
</li>
<li>
<a href="/clients/cpp-client"
class="">
C++ Client
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang Client
</a>
</li>
<li>
<a href="/clients/python-client"
class="">
Python Client
</a>
</li>
<li>
<a href="/clients/node-client"
class="">
NodeJS Client
</a>
</li>
<li>
<a href="/clients/scala-client"
class="">
Scala Client
</a>
</li>
</ul>
<p class="menu-label">Tools</p>
<ul class="menu-list">
<li>
<a href="/docs/tools/shell"
class="">
Pegasus Shell
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
Admin CLI
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
Pegasus data access CLI
</a>
</li>
</ul>
<p class="menu-label">API</p>
<ul class="menu-list">
<li>
<a href="/api/ttl"
class="">
TTL(Time To Live)
</a>
</li>
<li>
<a href="/api/single-atomic"
class="">
Single-Atomic Operations
</a>
</li>
<li>
<a href="/api/redis"
class="">
Redis Adaption
</a>
</li>
<li>
<a href="/api/geo"
class="is-active">
GEO Support
</a>
</li>
<li>
<a href="/api/http"
class="">
HTTP API
</a>
</li>
</ul>
<p class="menu-label">Admin</p>
<ul class="menu-list">
<li>
<a href="/administration/deployment"
class="">
Deployment
</a>
</li>
<li>
<a href="/administration/config"
class="">
Configurations
</a>
</li>
<li>
<a href="/administration/rebalance"
class="">
Rebalance
</a>
</li>
<li>
<a href="/administration/monitoring"
class="">
Monitoring
</a>
</li>
<li>
<a href="/administration/rolling-update"
class="">
Rolling Restart and Upgrade
</a>
</li>
<li>
<a href="/administration/scale-in-out"
class="">
Scale-in and Scale-out
</a>
</li>
<li>
<a href="/administration/resource-management"
class="">
Resource Management
</a>
</li>
<li>
<a href="/administration/cold-backup"
class="">
Cold Backup
</a>
</li>
<li>
<a href="/administration/meta-recovery"
class="">
Metadata Recovery
</a>
</li>
<li>
<a href="/administration/replica-recovery"
class="">
Replica Data Recovery
</a>
</li>
<li>
<a href="/administration/zk-migration"
class="">
Zookeeper Migration
</a>
</li>
<li>
<a href="/administration/table-migration"
class="">
Table Migration
</a>
</li>
<li>
<a href="/administration/table-soft-delete"
class="">
Table Soft-Delete
</a>
</li>
<li>
<a href="/administration/table-env"
class="">
Table Environment Variables
</a>
</li>
<li>
<a href="/administration/remote-commands"
class="">
Remote Command
</a>
</li>
<li>
<a href="/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/administration/duplication"
class="">
Duplication
</a>
</li>
<li>
<a href="/administration/compression"
class="">
Data Compression
</a>
</li>
<li>
<a href="/administration/throttling"
class="">
Throttling
</a>
</li>
<li>
<a href="/administration/experiences"
class="">
Experiences
</a>
</li>
<li>
<a href="/administration/manual-compact"
class="">
Manual Compact
</a>
</li>
<li>
<a href="/administration/usage-scenario"
class="">
Usage Scenario
</a>
</li>
<li>
<a href="/administration/bad-disk"
class="">
Bad Disk Repair
</a>
</li>
<li>
<a href="/administration/whitelist"
class="">
Replica Server Whitelist
</a>
</li>
<li>
<a href="/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/administration/hotspot-detection"
class="">
Hotspot Detection
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/api/geo"><strong>中文</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
The Pegasus documentation
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/downloads"
class="navbar-item ">
Downloads
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Building Pegasus
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/build/compile-by-docker"
class="navbar-item ">
Compile by docker (recommended)
</a>
<a href="/docs/build/compile-from-source"
class="navbar-item ">
Compile from source
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Client Libs
</span>
</a>
<div class="navbar-dropdown">
<a href="/clients/java-client"
class="navbar-item ">
Java Client
</a>
<a href="/clients/cpp-client"
class="navbar-item ">
C++ Client
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang Client
</a>
<a href="/clients/python-client"
class="navbar-item ">
Python Client
</a>
<a href="/clients/node-client"
class="navbar-item ">
NodeJS Client
</a>
<a href="/clients/scala-client"
class="navbar-item ">
Scala Client
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Tools
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/tools/shell"
class="navbar-item ">
Pegasus Shell
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
Admin CLI
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
Pegasus data access CLI
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
API
</span>
</a>
<div class="navbar-dropdown">
<a href="/api/ttl"
class="navbar-item ">
TTL(Time To Live)
</a>
<a href="/api/single-atomic"
class="navbar-item ">
Single-Atomic Operations
</a>
<a href="/api/redis"
class="navbar-item ">
Redis Adaption
</a>
<a href="/api/geo"
class="navbar-item is-active">
GEO Support
</a>
<a href="/api/http"
class="navbar-item ">
HTTP API
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Admin
</span>
</a>
<div class="navbar-dropdown">
<a href="/administration/deployment"
class="navbar-item ">
Deployment
</a>
<a href="/administration/config"
class="navbar-item ">
Configurations
</a>
<a href="/administration/rebalance"
class="navbar-item ">
Rebalance
</a>
<a href="/administration/monitoring"
class="navbar-item ">
Monitoring
</a>
<a href="/administration/rolling-update"
class="navbar-item ">
Rolling Restart and Upgrade
</a>
<a href="/administration/scale-in-out"
class="navbar-item ">
Scale-in and Scale-out
</a>
<a href="/administration/resource-management"
class="navbar-item ">
Resource Management
</a>
<a href="/administration/cold-backup"
class="navbar-item ">
Cold Backup
</a>
<a href="/administration/meta-recovery"
class="navbar-item ">
Metadata Recovery
</a>
<a href="/administration/replica-recovery"
class="navbar-item ">
Replica Data Recovery
</a>
<a href="/administration/zk-migration"
class="navbar-item ">
Zookeeper Migration
</a>
<a href="/administration/table-migration"
class="navbar-item ">
Table Migration
</a>
<a href="/administration/table-soft-delete"
class="navbar-item ">
Table Soft-Delete
</a>
<a href="/administration/table-env"
class="navbar-item ">
Table Environment Variables
</a>
<a href="/administration/remote-commands"
class="navbar-item ">
Remote Command
</a>
<a href="/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/administration/duplication"
class="navbar-item ">
Duplication
</a>
<a href="/administration/compression"
class="navbar-item ">
Data Compression
</a>
<a href="/administration/throttling"
class="navbar-item ">
Throttling
</a>
<a href="/administration/experiences"
class="navbar-item ">
Experiences
</a>
<a href="/administration/manual-compact"
class="navbar-item ">
Manual Compact
</a>
<a href="/administration/usage-scenario"
class="navbar-item ">
Usage Scenario
</a>
<a href="/administration/bad-disk"
class="navbar-item ">
Bad Disk Repair
</a>
<a href="/administration/whitelist"
class="navbar-item ">
Replica Server Whitelist
</a>
<a href="/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/administration/hotspot-detection"
class="navbar-item ">
Hotspot Detection
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/api/geo"><strong>中文</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">GEO Support</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<h1 id="pegasus-geo">Pegasus GEO</h1>
<h2 id="background">Background</h2>
<p>In Pegasus, when the user data is POI (Points of Interest) data that contains geographic information, such as longitude and latitude in the value, users need Pegasus to provide interfaces that support GEO features. For example: given a center point coordinate and a radius, search for all data within this range; or, given the hashkey and sortkey of two POI data, calculate the geographical distance between these two pieces of data.</p>
<p>Pegasus’s GEO (Geographic) support is built on the <a href="https://github.com/google/s2geometry">S2</a> library, which is mainly used for converting two-dimensional geographic coordinates (longitude+latitude) into a one-dimensional encoding, for range queries based on circles, and for its Hilbert curve properties, among other characteristics.</p>
<p>This article will explain how to fully utilize the characteristics of S2 in Pegasus, and combine the data distribution and storage characteristics of Pegasus to support GEO features.</p>
<p>Please refer to the <a href="http://s2geometry.io/">S2 official website</a> for the implementation principle of S2.</p>
<h2 id="coordinate-transformation">Coordinate transformation</h2>
<p>In S2, two-dimensional longitude and latitude can be encoded into one-dimensional encoding, which consists of two parts: face cells and plane coordinate encoding, such as:</p>
<p>The encoding of two-dimensional coordinate (116.334441, 40.030202) is: <code class="language-plaintext highlighter-rouge">1/223320022232200331010110113301</code>(32 bytes), it is called <strong>CellId</strong> in S2.</p>
<ul>
<li>The first <code class="language-plaintext highlighter-rouge">1</code> represents the face cell index of the Earth cube projection, with an index range of 0~5, as shown in the following figure:</li>
</ul>
<p><img src="/assets/images/geo_faces.png" alt="geo_faces.png" class="img-responsive" /></p>
<ul>
<li><code class="language-plaintext highlighter-rouge">/</code> is a delimiter</li>
<li><code class="language-plaintext highlighter-rouge">223320022232200331010110113301</code>(30 bytes), is the encoding obtained through a series of transformations of latitude and longitude coordinates, and the specific transformation process is not described in detail here. It should be pointed out that this is a Hilbert curve encoding, which is characterized by stability and continuity.</li>
</ul>
<p><img src="/assets/images/hilbert.png" alt="hilbert.png" class="img-responsive" /></p>
<p>Hilbert curve encoding in S2:</p>
<ul>
<li>The encoding can be seen as a base-4 (quaternary) number, i.e. each digit is one of 0, 1, 2 or 3</li>
<li>Encoding is done level by level from left to right, with a maximum of 30 levels</li>
<li>A code represents a geographic block area, and the longer the code, the smaller the area</li>
<li>The complete encoding is a sub-region of its prefix encoding, with each parent region consisting of four sub-regions. For example, <code class="language-plaintext highlighter-rouge">00</code>, <code class="language-plaintext highlighter-rouge">01</code>, <code class="language-plaintext highlighter-rouge">02</code>, and <code class="language-plaintext highlighter-rouge">03</code> are sub-regions of <code class="language-plaintext highlighter-rouge">0</code>, and the union of the sub-regions is equal to the region of the parent.</li>
<li>Numerically continuous values are also geographically adjacent, for example, the range of regions for <code class="language-plaintext highlighter-rouge">00</code> and <code class="language-plaintext highlighter-rouge">01</code> is adjacent, and the range of regions for <code class="language-plaintext highlighter-rouge">0122</code> and <code class="language-plaintext highlighter-rouge">0123</code> is also adjacent</li>
</ul>
<h2 id="encoding-accuracy">Encoding accuracy</h2>
<p>The Hilbert curve encoding in S2 consists of 30 bytes, each representing a level partition. The following table shows the area and number of individual cells in each level.</p>
<table>
<thead>
<tr>
<th><strong>level</strong></th>
<th><strong>min area</strong></th>
<th><strong>max area</strong></th>
<th><strong>average area</strong></th>
<th><strong>units</strong></th>
<th><strong>Number of cells</strong></th>
</tr>
</thead>
<tbody>
<tr>
<td>00</td>
<td>85011012.19</td>
<td>85011012.19</td>
<td>85011012.19</td>
<td>km^2</td>
<td>6</td>
</tr>
<tr>
<td>01</td>
<td>21252753.05</td>
<td>21252753.05</td>
<td>21252753.05</td>
<td>km^2</td>
<td>24</td>
</tr>
<tr>
<td>02</td>
<td>4919708.23</td>
<td>6026521.16</td>
<td>5313188.26</td>
<td>km^2</td>
<td>96</td>
</tr>
<tr>
<td>03</td>
<td>1055377.48</td>
<td>1646455.50</td>
<td>1328297.07</td>
<td>km^2</td>
<td>384</td>
</tr>
<tr>
<td>04</td>
<td>231564.06</td>
<td>413918.15</td>
<td>332074.27</td>
<td>km^2</td>
<td>1536</td>
</tr>
<tr>
<td>05</td>
<td>53798.67</td>
<td>104297.91</td>
<td>83018.57</td>
<td>km^2</td>
<td>6K</td>
</tr>
<tr>
<td>06</td>
<td>12948.81</td>
<td>26113.30</td>
<td>20754.64</td>
<td>km^2</td>
<td>24K</td>
</tr>
<tr>
<td>07</td>
<td>3175.44</td>
<td>6529.09</td>
<td>5188.66</td>
<td>km^2</td>
<td>98K</td>
</tr>
<tr>
<td>08</td>
<td>786.20</td>
<td>1632.45</td>
<td>1297.17</td>
<td>km^2</td>
<td>393K</td>
</tr>
<tr>
<td>09</td>
<td>195.59</td>
<td>408.12</td>
<td>324.29</td>
<td>km^2</td>
<td>1573K</td>
</tr>
<tr>
<td>10</td>
<td>48.78</td>
<td>102.03</td>
<td>81.07</td>
<td>km^2</td>
<td>6M</td>
</tr>
<tr>
<td>11</td>
<td>12.18</td>
<td>25.51</td>
<td>20.27</td>
<td>km^2</td>
<td>25M</td>
</tr>
<tr>
<td>12</td>
<td>3.04</td>
<td>6.38</td>
<td>5.07</td>
<td>km^2</td>
<td>100M</td>
</tr>
<tr>
<td>13</td>
<td>0.76</td>
<td>1.59</td>
<td>1.27</td>
<td>km^2</td>
<td>402M</td>
</tr>
<tr>
<td>14</td>
<td>0.19</td>
<td>0.40</td>
<td>0.32</td>
<td>km^2</td>
<td>1610M</td>
</tr>
<tr>
<td>15</td>
<td>47520.30</td>
<td>99638.93</td>
<td>79172.67</td>
<td>m^2</td>
<td>6B</td>
</tr>
<tr>
<td>16</td>
<td>11880.08</td>
<td>24909.73</td>
<td>19793.17</td>
<td>m^2</td>
<td>25B</td>
</tr>
<tr>
<td>17</td>
<td>2970.02</td>
<td>6227.43</td>
<td>4948.29</td>
<td>m^2</td>
<td>103B</td>
</tr>
<tr>
<td>18</td>
<td>742.50</td>
<td>1556.86</td>
<td>1237.07</td>
<td>m^2</td>
<td>412B</td>
</tr>
<tr>
<td>19</td>
<td>185.63</td>
<td>389.21</td>
<td>309.27</td>
<td>m^2</td>
<td>1649B</td>
</tr>
<tr>
<td>20</td>
<td>46.41</td>
<td>97.30</td>
<td>77.32</td>
<td>m^2</td>
<td>7T</td>
</tr>
<tr>
<td>21</td>
<td>11.60</td>
<td>24.33</td>
<td>19.33</td>
<td>m^2</td>
<td>26T</td>
</tr>
<tr>
<td>22</td>
<td>2.90</td>
<td>6.08</td>
<td>4.83</td>
<td>m^2</td>
<td>105T</td>
</tr>
<tr>
<td>23</td>
<td>0.73</td>
<td>1.52</td>
<td>1.21</td>
<td>m^2</td>
<td>422T</td>
</tr>
<tr>
<td>24</td>
<td>0.18</td>
<td>0.38</td>
<td>0.30</td>
<td>m^2</td>
<td>1689T</td>
</tr>
<tr>
<td>25</td>
<td>453.19</td>
<td>950.23</td>
<td>755.05</td>
<td>cm^2</td>
<td>7e15</td>
</tr>
<tr>
<td>26</td>
<td>113.30</td>
<td>237.56</td>
<td>188.76</td>
<td>cm^2</td>
<td>27e15</td>
</tr>
<tr>
<td>27</td>
<td>28.32</td>
<td>59.39</td>
<td>47.19</td>
<td>cm^2</td>
<td>108e15</td>
</tr>
<tr>
<td>28</td>
<td>7.08</td>
<td>14.85</td>
<td>11.80</td>
<td>cm^2</td>
<td>432e15</td>
</tr>
<tr>
<td>29</td>
<td>1.77</td>
<td>3.71</td>
<td>2.95</td>
<td>cm^2</td>
<td>1729e15</td>
</tr>
<tr>
<td>30</td>
<td>0.44</td>
<td>0.93</td>
<td>0.74</td>
<td>cm^2</td>
<td>7e18</td>
</tr>
</tbody>
</table>
<h2 id="data-storage">Data Storage</h2>
<p>In Pegasus, the key for data storage is a combination of hashkey and sortkey: the hashkey is used to determine the partition where the data is located. Data belonging to the same hashkey is stored in a logically contiguous area in the same Replica Server, and the sortkey is used to sort the data in this area.</p>
<p>After converting the longitude and latitude coordinates to obtain the one-dimensional encoding <code class="language-plaintext highlighter-rouge">CellId</code>, this one-dimensional encoding can be stored as a key for <strong>GEO index data</strong>. Pegasus divides this one-dimensional encoding into two parts: hashkey and sortkey, and different byte partitioning strategies can be adopted according to actual user scenarios.</p>
<p>GEO index data is independent of the original data, and the two types of data are stored in different Pegasus tables. Pegasus uses <a href="https://github.com/apache/incubator-pegasus/blob/master/src/geo/lib/geo_client.h">GEO Client</a> to synchronize data for the two tables, and supports access to both native Pegasus API and GEO API.</p>
<p>So, when using the Pegasus GEO feature, it is necessary to create two Pegasus tables: one is the original table to store the raw data written by the user, and the other is the GEO index table to store the GEO index data generated by the GEO client’s automatic conversion of raw data.</p>
<h3 id="hashkey">hashkey</h3>
<p>The hashkey is a prefix of the one-dimensional encoding. For example, in a user scenario, setting the hashkey length to <code class="language-plaintext highlighter-rouge">14</code> (1-byte face, 1-byte delimiter <code class="language-plaintext highlighter-rouge">/</code>, 12-byte Hilbert encoding) can achieve better performance.</p>
<blockquote>
<p>So, the <strong>minimum search level</strong> is 12</p>
</blockquote>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> CellId
|1/223320022232..................|
|-------------32 bytes-----------|
|---14 bytes--|
hashkey
</code></pre></div></div>
<h3 id="sortkey">sortkey</h3>
<p>To meet the requirements of queries with different radius ranges and precisions, we put all the remaining 18 bytes of CellId into the sortkey.</p>
<ul>
<li>If the query is over larger radius ranges, take fewer sortkey bytes (corresponding to shorter CellId) as prefixes for data scan queries, which can reduce the number of data scans</li>
<li>If the query is over smaller radius ranges or point queries, take more sortkey bytes (corresponding to longer CellId) as prefixes for data scan queries, which can reduce the range of data scans</li>
</ul>
<p>This can maintain high flexibility for the application without modifying the underlying stored data.</p>
<blockquote>
<p>When querying data within the same geographical area (such as a circular area), using shorter CellIds to query data has larger ranges and fewer queries, but yields more useless data outside the area. Using longer CellIds to query data results in smaller range of queries, resulting in less useless data outside the region, but with higher number of queries</p>
<p>refer to: <a href="http://s2geometry.io/devguide/examples/coverings">S2 coverings</a></p>
</blockquote>
<p>Although the area of the cell is already small enough ( &lt; 1cm^2) at the 30th level, it is still possible for two POI data to fall into the same cell, so it is necessary to solve the key conflict problem based on CellId encoding. Pegasus combines the hashkey and sortkey of the original table and appends them to the sortkey of the GEO data table.</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> CellId
|1/223320022232200331010110113301|
|-------------32 bytes-----------|
|---14 bytes--||-----18 bytes----||-orig hashkey-||-orig sortkey-|
|-GEO hashkey-||-------------------GEO sortkey-------------------|
</code></pre></div></div>
<h3 id="value">value</h3>
<p>When using the Pegasus GEO feature, the value must be able to extract longitude and latitude, and the extract method can be found in <a href="/api/geo#value-extrator">Value Extractor</a>.</p>
<p>The value of the GEO index table is exactly the same as the value of the original table, so there will be redundant data. This design trades space for time to avoid secondary indexing.</p>
<blockquote>
<p>If there is a need to store large data in a single POI data and you want to save disk space, you can manually implement secondary indexing, which means storing the key of the secondary index in the GEO value and then storing the actual large value in another table.</p>
</blockquote>
<h2 id="data-updates">Data updates</h2>
<h3 id="set">set</h3>
<p><code class="language-plaintext highlighter-rouge">set</code> operation will simultaneously update the data of the two tables mentioned above, namely the Pegasus raw table data and GEO index table data, and the data generation method is also described above.</p>
<p>The hashkey and sortkey of the <code class="language-plaintext highlighter-rouge">set</code> operation are in the user’s own format and are not constrained when using the GEO APIs. The data synchronization of two tables is transparent to users and is automatically completed by the GEO client.</p>
<p>When using the Redis GEO API, refer to <a href="/api/redis#geo-api">GEO API</a></p>
<p>In the Pegasus implementation, the <code class="language-plaintext highlighter-rouge">set</code> operation first attempts to read and retrieve existing data. If the data does not exist, it directly writes data to both tables. If the data already exists, the old GEO index data will be cleaned up before writing new data, because the index data <code class="language-plaintext highlighter-rouge">&lt;hashkey, sortkey&gt;</code> of the new and old data may be different (i.e., the longitudes and latitudes obtained by the extractor for the new and old values are different). If the old index data is not cleaned up, garbage data will remain in the GEO index table, wasting disk space, and this dirty data will also be returned by geographic range queries (such as <code class="language-plaintext highlighter-rouge">GEORADIUS</code>).</p>
<h3 id="del">del</h3>
<p>The <code class="language-plaintext highlighter-rouge">del</code> operation will delete data from both tables simultaneously, following the same principle above.</p>
<h2 id="data-queries">Data queries</h2>
<h3 id="design">Design</h3>
<p>Geographic range queries will be converted into multiple scan operations by Pegasus, with each scan corresponding to all data scans within a CellId range. To achieve higher performance, it is necessary to reduce the total number of scan operations and the amount of data per scan operation, which means reducing the total number of CellIds and the area of a single CellId.</p>
<p>For example, when performing a range query with the red circle, the CellId query set shown as the blue blocks can be used:</p>
<p><img src="/assets/images/s2_cap_1.png" alt="s2_cap_1.png" class="img-responsive" /></p>
<p>Although such results are more accurate, there are more CellIds involved in the calculation, resulting in more client-server RPCs, higher network overhead, and higher latency. In addition, in real usage scenarios, a CellId that is too small may not contain any POI data, but it will still consume one RPC.</p>
<p>So, in the current Pegasus implementation, only two levels of cells, the <code class="language-plaintext highlighter-rouge">minimum search level</code> and the <code class="language-plaintext highlighter-rouge">maximum search level</code>, are used together. Taking levels 12 and 16 as examples, the CellId query set obtained is shown in the blue blocks as:</p>
<p><img src="/assets/images/s2_cap_2.png" alt="s2_cap_2.png" class="img-responsive" /></p>
<h3 id="query-process">Query process</h3>
<p>Taking <code class="language-plaintext highlighter-rouge">search_radial</code> as an example, it queries all POI data within the circular area according to the given center point and radius.</p>
<blockquote>
<p>Here we only discuss POI data queries for circular regions, while the idea for other regions such as polygonal regions is similar.</p>
</blockquote>
<p>Use the S2 API to query the CellId set that covers the given region:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Returns an S2CellUnion that covers the given region and satisfies the current options.
S2CellUnion GetCovering(const S2Region&amp; region);
</code></pre></div></div>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">search_radial</code> has two overloads: one takes longitude and latitude as input, and the other takes hashkey and sortkey. The latter queries the value from the raw data table through the keys, extracts the longitude and latitude from the value, and then invokes the former.</p>
</blockquote>
<p>Query process:</p>
<ol>
<li>Calculate the circular region S2Cap <code class="language-plaintext highlighter-rouge">C</code> based on longitude, latitude, and radius</li>
<li>Based on the circular region and the specified <code class="language-plaintext highlighter-rouge">minimum search level</code>, calculate the CellId set on the <code class="language-plaintext highlighter-rouge">minimum search level</code> using <code class="language-plaintext highlighter-rouge">GetCovering</code></li>
<li>Traverse these CellIds to determine the relationship between the CellId region and the circular region <code class="language-plaintext highlighter-rouge">C</code>
<ol>
<li>Full coverage: Retrieve all POI data within the CellId</li>
<li>Half coverage: Split the CellId according to the <code class="language-plaintext highlighter-rouge">maximum search level</code> and determine the relationship between sub_CellId region and the circular region <code class="language-plaintext highlighter-rouge">C</code>
<ol>
<li>Overlay/Intersection: Take all the POI data in the sub_CellId</li>
<li>Disjoint: Discard</li>
</ol>
</li>
</ol>
</li>
</ol>
<blockquote>
<p>The configuration of the <code class="language-plaintext highlighter-rouge">minimum search level</code> and the <code class="language-plaintext highlighter-rouge">maximum search level</code> is described later in this document.</p>
<p>The CellId length of the <code class="language-plaintext highlighter-rouge">minimum search level</code> determines the hashkey length of the data in GEO index table.</p>
</blockquote>
<p>When querying all the POI data of a CellId, a pair of <code class="language-plaintext highlighter-rouge">start_sortkey</code> and <code class="language-plaintext highlighter-rouge">stop_sortkey</code> that bounds all the POI data of the CellId is constructed according to the key construction rules described earlier in this document, and Pegasus’ <code class="language-plaintext highlighter-rouge">scan</code> interface is then used to query the data.</p>
<ul>
<li>For the <code class="language-plaintext highlighter-rouge">minimum search level</code> CellId obtained in step <code class="language-plaintext highlighter-rouge">3.1</code>, the CellId encoding is also the hashkey of the data in the GEO index table, so call Pegasus <code class="language-plaintext highlighter-rouge">scan(CellId, "", "")</code> to query all POI data
<ul>
<li>For example, if a cell in level 12, <code class="language-plaintext highlighter-rouge">1/223320022232</code>, is fully covered by the region, call <code class="language-plaintext highlighter-rouge">scan("1/223320022232", "", "")</code> to query all POI data</li>
</ul>
</li>
<li>For the sub_CellId set obtained in step <code class="language-plaintext highlighter-rouge">3.2.1</code>, the hashkey is their common prefix, so call <code class="language-plaintext highlighter-rouge">scan(sub_CellId_common_prefix, sub_CellId1, sub_CellId2)</code> to query POI data
<ul>
<li>sub_CellId_common_prefix is the common prefix of the CellIds in sub_CellId set, its length is the length of hashkey. All the CellIds between sub_CellId1 and sub_CellId2 are continuous and all are in sub_CellId set, their length is the length of (<code class="language-plaintext highlighter-rouge">maximum search level</code> - <code class="language-plaintext highlighter-rouge">minimum search level</code>)</li>
<li>For example, when the sub-regions <code class="language-plaintext highlighter-rouge">0001</code>, <code class="language-plaintext highlighter-rouge">0002</code>, <code class="language-plaintext highlighter-rouge">0003</code> and <code class="language-plaintext highlighter-rouge">0100</code> of a level-12 cell <code class="language-plaintext highlighter-rouge">1/223320022232</code> intersect the search region, call <code class="language-plaintext highlighter-rouge">scan("1/223320022232", "0001", "0003")</code> and <code class="language-plaintext highlighter-rouge">scan("1/223320022232", "0100", "0100")</code></li>
</ul>
</li>
</ul>
<p>After obtaining the result of <code class="language-plaintext highlighter-rouge">scan</code>, further processing is required:</p>
<ul>
<li>Calculate distance: Because a CellId may only partially overlap with the search region, if a POI is outside the search region, discard it</li>
<li>Sorting: When there is a requirement for ascending/descending order</li>
</ul>
<h3 id="flexibility">Flexibility</h3>
<p>Due to storing the complete 30 levels of CellIds, in practical use, we can adjust the <code class="language-plaintext highlighter-rouge">maximum search level</code> according to the geographic data density, network IO, disk IO conditions.</p>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">maximum search level</code> is 16 by default.</p>
</blockquote>
<h4 id="api-method">API method</h4>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>dsn::error_s set_max_level(int level);
</code></pre></div></div>
<h4 id="configuration-file-method">Configuration file method</h4>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[geo_client.lib]
max_level = 16
</code></pre></div></div>
<h3 id="invariance">Invariance</h3>
<p>Due to the fact that the <code class="language-plaintext highlighter-rouge">minimum search level</code> determines the length of the hashkey in GEO index table, once the data is written to Pegasus, the <code class="language-plaintext highlighter-rouge">minimum search level</code> cannot be modified because the data has been persisted according to this hashkey length rule.</p>
<p>The data needs to be reconstructed if modification is required.</p>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">minimum search level</code> is 12 by default.</p>
</blockquote>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[geo_client.lib]
;NOTE: 'min_level' is immutable after some data has been inserted into DB by geo_client.
min_level = 12
</code></pre></div></div>
<h2 id="value-extractor">Value extractor</h2>
<p>Currently, Pegasus supports extracting longitude and latitude from fixed format values. Longitude and latitude are serialized as strings in value, separated by <code class="language-plaintext highlighter-rouge">|</code>.</p>
<p>For example: value can be <code class="language-plaintext highlighter-rouge">.*|115.886447|41.269031|.*</code>, the index of longitude and latitude in value is determined by the <code class="language-plaintext highlighter-rouge">latitude_index</code> and <code class="language-plaintext highlighter-rouge">longitude_index</code>.</p>
<h2 id="api--redis-proxy">API &amp; Redis Proxy</h2>
<p>There are two ways to use Pegasus GEO features: one is to directly use C++ GEO Client, and the other is to use Redis Proxy.</p>
<p>See the <a href="https://github.com/apache/incubator-pegasus/blob/master/src/geo/lib/geo_client.h">C++ GEO client codebase</a> for a detailed API description.</p>
<h2 id="configuration">Configuration</h2>
<p>Please refer to the usage of Redis Proxy <a href="redis">Redis Adaption</a>.</p>
<p>The configuration files added by GEO feature are as follows:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[geo_client.lib]
;NOTE: 'min_level' is immutable after some data has been inserted into DB by geo_client.
min_level = 12
max_level = 16
; Used by 'value extractor'
latitude_index = 5
longitude_index = 4
</code></pre></div></div>
<h2 id="data-import-in-batch">Data import in batch</h2>
<p>In some usage scenarios, users already have a raw data table whose values contain longitudes and latitudes, and need to construct the GEO index table mentioned above. The <a href="/docs/tools/shell/#copy_data">copy_data</a> function in the shell tool can be used to achieve this. For example:</p>
<p>Before the <code class="language-plaintext highlighter-rouge">copy_data</code> operation, the target cluster and two target tables (i.e., the raw data table <code class="language-plaintext highlighter-rouge">temp</code> and GEO index table <code class="language-plaintext highlighter-rouge">temp_geo</code>) need to be created first.</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>copy_data -c target_cluster -a temp -g
</code></pre></div></div>
<p>After the data import is completed, Redis Proxy can be set up. For specific instructions, please refer to <a href="redis">Redis Adaption</a>. For example:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[apps.proxy]
; if using GEO APIs, an extra table name which stores geo index data should be appended, i.e.
arguments = redis_cluster temp temp_geo
</code></pre></div></div>
<h2 id="benchmark">Benchmark</h2>
<h3 id="environment">Environment</h3>
<h4 id="hardware">Hardware</h4>
<ul>
<li>CPU: E5-2620v3 * 2</li>
<li>Memory: 128GB</li>
<li>Disk: capacity 480GB SSD * 8</li>
<li>Network card: bandwidth 1Gb</li>
</ul>
<h4 id="cluster">Cluster</h4>
<ul>
<li>Replica Server count: 5</li>
<li>Version: v1.9.2</li>
<li>Partition count of the test table: 128</li>
<li>Single data size: 120 bytes</li>
</ul>
<h4 id="testing-interface">Testing interface</h4>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>void async_search_radial(double lat_degrees,
double lng_degrees,
double radius_m,
int count,
SortType sort_type,
int timeout_ms,
geo_search_callback_t &amp;&amp;callback);
</code></pre></div></div>
<p><strong>Parameters</strong></p>
<ul>
<li>lat_degrees, lng_degrees: Select random points within the 5th-Ring Road of Beijing for every query</li>
<li>radius_m: The first column of the following table, in meters</li>
<li>count: -1, which indicates an unlimited number of results</li>
<li>sort_type: un-ordered</li>
</ul>
<h3 id="result">Result</h3>
<table>
<thead>
<tr>
<th>Radius(m)</th>
<th>P50(ms)</th>
<th>P75(ms)</th>
<th>P99(ms)</th>
<th>P99.9(ms)</th>
<th>Avg result count</th>
<th>QPS per node</th>
</tr>
</thead>
<tbody>
<tr>
<td>50</td>
<td>1.63071622</td>
<td>1.84607433</td>
<td>4.04545455</td>
<td>6.28</td>
<td>9.4608</td>
<td>740.287</td>
</tr>
<tr>
<td>100</td>
<td>1.76</td>
<td>2.33614794</td>
<td>5.4</td>
<td>6.45319149</td>
<td>38.0296</td>
<td>656.66</td>
</tr>
<tr>
<td>200</td>
<td>2.41017042</td>
<td>3.31062092</td>
<td>6.41781609</td>
<td>9.60588235</td>
<td>154.3682</td>
<td>536.624</td>
</tr>
<tr>
<td>300</td>
<td>3.30833333</td>
<td>4.21979167</td>
<td>9.4310559</td>
<td>18</td>
<td>350.9676</td>
<td>434.491</td>
</tr>
<tr>
<td>500</td>
<td>5.07763975</td>
<td>6.84964682</td>
<td>16.84931507</td>
<td>21.78082192</td>
<td>986.0826</td>
<td>347.231</td>
</tr>
<tr>
<td>1000</td>
<td>12.28164727</td>
<td>18.70972532</td>
<td>43.18181818</td>
<td>57.049698</td>
<td>3947.5294</td>
<td>204.23</td>
</tr>
<tr>
<td>2000</td>
<td>35.78666667</td>
<td>54.7300885</td>
<td>108.7331378</td>
<td>148.616578</td>
<td>15674.1198</td>
<td>98.7633</td>
</tr>
</tbody>
</table>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
Table of contents
</p>
<ul class="menu-list">
<li><a href="#pegasus-geo">Pegasus GEO</a>
<ul>
<li><a href="#background">Background</a></li>
<li><a href="#coordinate-transformation">Coordinate transformation</a></li>
<li><a href="#encoding-accuracy">Encoding accuracy</a></li>
<li><a href="#data-storage">Data Storage</a>
<ul>
<li><a href="#hashkey">hashkey</a></li>
<li><a href="#sortkey">sortkey</a></li>
<li><a href="#value">value</a></li>
</ul>
</li>
<li><a href="#data-updates">Data updates</a>
<ul>
<li><a href="#set">set</a></li>
<li><a href="#del">del</a></li>
</ul>
</li>
<li><a href="#data-queries">Data queries</a>
<ul>
<li><a href="#design">Design</a></li>
<li><a href="#query-process">Query process</a></li>
<li><a href="#flexibility">Flexibility</a>
<ul>
<li><a href="#api-method">API method</a></li>
<li><a href="#configuration-file-method">Configuration file method</a></li>
</ul>
</li>
<li><a href="#invariance">Invariance</a></li>
</ul>
</li>
<li><a href="#value-extractor">Value extractor</a></li>
<li><a href="#api--redis-proxy">API &amp; Redis Proxy</a></li>
<li><a href="#configuration">Configuration</a></li>
<li><a href="#data-import-in-batch">Data import in batch</a></li>
<li><a href="#benchmark">Benchmark</a>
<ul>
<li><a href="#environment">Environment</a>
<ul>
<li><a href="#hardware">Hardware</a></li>
<li><a href="#cluster">Cluster</a></li>
<li><a href="#testing-interface">Testing interface</a></li>
</ul>
</li>
<li><a href="#result">Result</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
docsearch({
container: '#docsearch',
appId: 'QRN30RBW0S',
indexName: 'pegasus-apache',
apiKey: 'd3a3252fa344359766707a106c4ed88f',
debug: true
});
</script>
</body>
</html>