blob: 22f82b79dc96233a6e6488e122b65417f72d7e3b [file] [log] [blame]
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Geo</title>
<link rel="stylesheet" href="/zh/assets/css/app.css">
<link rel="shortcut icon" href="/zh/assets/images/favicon.ico">
<link rel="stylesheet" href="/zh/assets/css/utilities.min.css">
<link rel="stylesheet" href="/zh/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Geo | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.2" />
<meta property="og:title" content="Geo" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Pegasus GEO支持" />
<meta property="og:description" content="Pegasus GEO支持" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2023-11-23T14:51:44+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Geo" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2023-11-23T14:51:44+00:00","datePublished":"2023-11-23T14:51:44+00:00","description":"Pegasus GEO支持","headline":"Geo","mainEntityOfPage":{"@type":"WebPage","@id":"/api/geo"},"url":"/api/geo"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/zh/">
<img src="/assets/images/pegasus-logo-inv.png" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">Pegasus产品文档</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/downloads"
class="">
下载
</a>
</li>
</ul>
<p class="menu-label">编译构建</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/build/compile-by-docker"
class="">
使用Docker完成编译(推荐)
</a>
</li>
<li>
<a href="/zh/docs/build/compile-from-source"
class="">
从源码编译
</a>
</li>
</ul>
<p class="menu-label">客户端库</p>
<ul class="menu-list">
<li>
<a href="/zh/clients/java-client"
class="">
Java客户端
</a>
</li>
<li>
<a href="/zh/clients/cpp-client"
class="">
C++客户端
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang客户端
</a>
</li>
<li>
<a href="/zh/clients/python2-client"
class="">
Python2客户端
</a>
</li>
<li>
<a href="/zh/clients/python3-client"
class="">
Python3客户端
</a>
</li>
<li>
<a href="/zh/clients/node-client"
class="">
NodeJS客户端
</a>
</li>
<li>
<a href="/zh/clients/scala-client"
class="">
Scala客户端
</a>
</li>
</ul>
<p class="menu-label">生态工具</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/tools/shell"
class="">
Pegasus Shell 工具
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
集群管理命令行
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
数据访问命令行
</a>
</li>
</ul>
<p class="menu-label">用户接口</p>
<ul class="menu-list">
<li>
<a href="/zh/api/ttl"
class="">
TTL
</a>
</li>
<li>
<a href="/zh/api/single-atomic"
class="">
单行原子操作
</a>
</li>
<li>
<a href="/zh/api/redis"
class="">
Redis适配
</a>
</li>
<li>
<a href="/zh/api/geo"
class="is-active">
GEO支持
</a>
</li>
<li>
<a href="/zh/api/http"
class="">
HTTP接口
</a>
</li>
</ul>
<p class="menu-label">高效运维</p>
<ul class="menu-list">
<li>
<a href="/zh/administration/deployment"
class="">
集群部署
</a>
</li>
<li>
<a href="/zh/administration/config"
class="">
配置说明
</a>
</li>
<li>
<a href="/zh/administration/rebalance"
class="">
负载均衡
</a>
</li>
<li>
<a href="/zh/administration/monitoring"
class="">
可视化监控
</a>
</li>
<li>
<a href="/zh/administration/rolling-update"
class="">
集群升级
</a>
</li>
<li>
<a href="/zh/administration/scale-in-out"
class="">
集群扩容缩容
</a>
</li>
<li>
<a href="/zh/administration/resource-management"
class="">
资源管理
</a>
</li>
<li>
<a href="/zh/administration/cold-backup"
class="">
冷备份
</a>
</li>
<li>
<a href="/zh/administration/meta-recovery"
class="">
元数据恢复
</a>
</li>
<li>
<a href="/zh/administration/replica-recovery"
class="">
Replica数据恢复
</a>
</li>
<li>
<a href="/zh/administration/zk-migration"
class="">
Zookeeper迁移
</a>
</li>
<li>
<a href="/zh/administration/table-migration"
class="">
Table迁移
</a>
</li>
<li>
<a href="/zh/administration/table-soft-delete"
class="">
Table软删除
</a>
</li>
<li>
<a href="/zh/administration/table-env"
class="">
Table环境变量
</a>
</li>
<li>
<a href="/zh/administration/remote-commands"
class="">
远程命令
</a>
</li>
<li>
<a href="/zh/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/zh/administration/duplication"
class="">
跨机房同步
</a>
</li>
<li>
<a href="/zh/administration/compression"
class="">
数据压缩
</a>
</li>
<li>
<a href="/zh/administration/throttling"
class="">
流量控制
</a>
</li>
<li>
<a href="/zh/administration/experiences"
class="">
运维经验
</a>
</li>
<li>
<a href="/zh/administration/manual-compact"
class="">
Manual Compact功能
</a>
</li>
<li>
<a href="/zh/administration/usage-scenario"
class="">
Usage Scenario功能
</a>
</li>
<li>
<a href="/zh/administration/bad-disk"
class="">
坏盘检修
</a>
</li>
<li>
<a href="/zh/administration/whitelist"
class="">
白名单
</a>
</li>
<li>
<a href="/zh/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/zh/administration/hotspot-detection"
class="">
热点检测
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/zh/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--If the page language is zh, the button switches to en.-->
<!--If you don't want a URL to be relativized, add a leading space to the href;
this prevents polyglot from relativizing it.-->
<a class="button is-light is-outlined is-inverted" href=" /api/geo"><strong>En</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Pegasus产品文档
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/downloads"
class="navbar-item ">
下载
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
编译构建
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/build/compile-by-docker"
class="navbar-item ">
使用Docker完成编译(推荐)
</a>
<a href="/zh/docs/build/compile-from-source"
class="navbar-item ">
从源码编译
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
客户端库
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/clients/java-client"
class="navbar-item ">
Java客户端
</a>
<a href="/zh/clients/cpp-client"
class="navbar-item ">
C++客户端
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang客户端
</a>
<a href="/zh/clients/python2-client"
class="navbar-item ">
Python2客户端
</a>
<a href="/zh/clients/python3-client"
class="navbar-item ">
Python3客户端
</a>
<a href="/zh/clients/node-client"
class="navbar-item ">
NodeJS客户端
</a>
<a href="/zh/clients/scala-client"
class="navbar-item ">
Scala客户端
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
生态工具
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/tools/shell"
class="navbar-item ">
Pegasus Shell 工具
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
集群管理命令行
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
数据访问命令行
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
用户接口
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/api/ttl"
class="navbar-item ">
TTL
</a>
<a href="/zh/api/single-atomic"
class="navbar-item ">
单行原子操作
</a>
<a href="/zh/api/redis"
class="navbar-item ">
Redis适配
</a>
<a href="/zh/api/geo"
class="navbar-item is-active">
GEO支持
</a>
<a href="/zh/api/http"
class="navbar-item ">
HTTP接口
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
高效运维
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/administration/deployment"
class="navbar-item ">
集群部署
</a>
<a href="/zh/administration/config"
class="navbar-item ">
配置说明
</a>
<a href="/zh/administration/rebalance"
class="navbar-item ">
负载均衡
</a>
<a href="/zh/administration/monitoring"
class="navbar-item ">
可视化监控
</a>
<a href="/zh/administration/rolling-update"
class="navbar-item ">
集群升级
</a>
<a href="/zh/administration/scale-in-out"
class="navbar-item ">
集群扩容缩容
</a>
<a href="/zh/administration/resource-management"
class="navbar-item ">
资源管理
</a>
<a href="/zh/administration/cold-backup"
class="navbar-item ">
冷备份
</a>
<a href="/zh/administration/meta-recovery"
class="navbar-item ">
元数据恢复
</a>
<a href="/zh/administration/replica-recovery"
class="navbar-item ">
Replica数据恢复
</a>
<a href="/zh/administration/zk-migration"
class="navbar-item ">
Zookeeper迁移
</a>
<a href="/zh/administration/table-migration"
class="navbar-item ">
Table迁移
</a>
<a href="/zh/administration/table-soft-delete"
class="navbar-item ">
Table软删除
</a>
<a href="/zh/administration/table-env"
class="navbar-item ">
Table环境变量
</a>
<a href="/zh/administration/remote-commands"
class="navbar-item ">
远程命令
</a>
<a href="/zh/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/zh/administration/duplication"
class="navbar-item ">
跨机房同步
</a>
<a href="/zh/administration/compression"
class="navbar-item ">
数据压缩
</a>
<a href="/zh/administration/throttling"
class="navbar-item ">
流量控制
</a>
<a href="/zh/administration/experiences"
class="navbar-item ">
运维经验
</a>
<a href="/zh/administration/manual-compact"
class="navbar-item ">
Manual Compact功能
</a>
<a href="/zh/administration/usage-scenario"
class="navbar-item ">
Usage Scenario功能
</a>
<a href="/zh/administration/bad-disk"
class="navbar-item ">
坏盘检修
</a>
<a href="/zh/administration/whitelist"
class="navbar-item ">
白名单
</a>
<a href="/zh/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/zh/administration/hotspot-detection"
class="navbar-item ">
热点检测
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--If the page language is zh, the button switches to en.-->
<!--If you don't want a URL to be relativized, add a leading space to the href;
this prevents polyglot from relativizing it.-->
<a class="button is-light is-outlined is-inverted" href=" /api/geo"><strong>En</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">GEO支持</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<h1 id="pegasus-geo支持">Pegasus GEO支持</h1>
<h2 id="背景">背景</h2>
<p>业务数据跟Pegasus的普通数据类似,由hashkey、sortkey、value组成。但业务数据隐含有地理信息,比如value中包含有经纬度(latitude,longitude),需要提供API进行GEO特性的支持,比如给定一个中心点坐标和一个半径,查找这个范围内的所有数据;给定两条数据的hashkey和sortkey,求这两条数据地理上的距离。</p>
<p>pegasus的GEO(Geographic)支持使用了<a href="https://github.com/google/s2geometry">S2</a>库, 主要利用其中将二维地理坐标(经纬度)与一维编码的相互转换、基于圆形的范围查询、Hilbert曲线规则等特性。在Pegasus中如何充分利用S2的这些特性,并结合Pegasus的数据分布、数据存储特性,是本文的阐述重点。</p>
<p>关于S2的实现原理细节请参考<a href="http://s2geometry.io/">S2官网</a></p>
<h2 id="坐标转换">坐标转换</h2>
<p>在S2中,可以把二维经纬度编码成一维编码,一维编码由两部分组成:立方体面、平面坐标编码,比如:</p>
<p>经纬度(116.334441,40.030202)的编码是:<code class="language-plaintext highlighter-rouge">1/223320022232200331010110113301</code>(32位),在S2中称为cellid。</p>
<p>其中,首位的<code class="language-plaintext highlighter-rouge">1</code>代表地球立方体投影的面索引,索引范围是0~5,如下图所示:</p>
<p><img src="/assets/images/geo_faces.png" alt="geo_faces.png" class="img-responsive" /></p>
<p><code class="language-plaintext highlighter-rouge">/</code>是分隔符</p>
<p><code class="language-plaintext highlighter-rouge">223320022232200331010110113301</code>(30位)是经纬度坐标经过一系列转换得到的编码,具体转换过程这里不详细描述。需要指出的是,这是一种名为Hilbert曲线的编码,它最大的特点是具有稳定性、连续性。</p>
<p><img src="/assets/images/hilbert.png" alt="hilbert.png" class="img-responsive" /></p>
<p>编码由前往后按层进行,完整编码是前缀编码的子区域,每个父区域由4个子区域组成,比如<code class="language-plaintext highlighter-rouge">00</code>,<code class="language-plaintext highlighter-rouge">01</code>,<code class="language-plaintext highlighter-rouge">02</code>,<code class="language-plaintext highlighter-rouge">03</code>是<code class="language-plaintext highlighter-rouge">0</code>的子区域,且前者的区域范围的并集就是后者的区域范围。最多有30层,每层都有相应的cellid集合,高层cell是底层cell的父区域,高层cellid是底层cellid的前缀。</p>
<p>编码可以看作是一个4进制的数值编码,同时<strong>在数值上连续的值,在地理位置上也是连续的</strong></p>
<h2 id="编码精度">编码精度</h2>
<p>S2中的Hilbert曲线编码由30位组成,每一位代表一层划分。下表是各层单个cell的面积和cell个数。</p>
<table>
<thead>
<tr>
<th><strong>level</strong></th>
<th><strong>min area</strong></th>
<th><strong>max area</strong></th>
<th><strong>average area</strong></th>
<th><strong>units</strong></th>
<th><strong>Number of cells</strong></th>
</tr>
</thead>
<tbody>
<tr>
<td>00</td>
<td>85011012.19</td>
<td>85011012.19</td>
<td>85011012.19</td>
<td>km^2</td>
<td>6</td>
</tr>
<tr>
<td>01</td>
<td>21252753.05</td>
<td>21252753.05</td>
<td>21252753.05</td>
<td>km^2</td>
<td>24</td>
</tr>
<tr>
<td>02</td>
<td>4919708.23</td>
<td>6026521.16</td>
<td>5313188.26</td>
<td>km^2</td>
<td>96</td>
</tr>
<tr>
<td>03</td>
<td>1055377.48</td>
<td>1646455.50</td>
<td>1328297.07</td>
<td>km^2</td>
<td>384</td>
</tr>
<tr>
<td>04</td>
<td>231564.06</td>
<td>413918.15</td>
<td>332074.27</td>
<td>km^2</td>
<td>1536</td>
</tr>
<tr>
<td>05</td>
<td>53798.67</td>
<td>104297.91</td>
<td>83018.57</td>
<td>km^2</td>
<td>6K</td>
</tr>
<tr>
<td>06</td>
<td>12948.81</td>
<td>26113.30</td>
<td>20754.64</td>
<td>km^2</td>
<td>24K</td>
</tr>
<tr>
<td>07</td>
<td>3175.44</td>
<td>6529.09</td>
<td>5188.66</td>
<td>km^2</td>
<td>98K</td>
</tr>
<tr>
<td>08</td>
<td>786.20</td>
<td>1632.45</td>
<td>1297.17</td>
<td>km^2</td>
<td>393K</td>
</tr>
<tr>
<td>09</td>
<td>195.59</td>
<td>408.12</td>
<td>324.29</td>
<td>km^2</td>
<td>1573K</td>
</tr>
<tr>
<td>10</td>
<td>48.78</td>
<td>102.03</td>
<td>81.07</td>
<td>km^2</td>
<td>6M</td>
</tr>
<tr>
<td>11</td>
<td>12.18</td>
<td>25.51</td>
<td>20.27</td>
<td>km^2</td>
<td>25M</td>
</tr>
<tr>
<td>12</td>
<td>3.04</td>
<td>6.38</td>
<td>5.07</td>
<td>km^2</td>
<td>100M</td>
</tr>
<tr>
<td>13</td>
<td>0.76</td>
<td>1.59</td>
<td>1.27</td>
<td>km^2</td>
<td>402M</td>
</tr>
<tr>
<td>14</td>
<td>0.19</td>
<td>0.40</td>
<td>0.32</td>
<td>km^2</td>
<td>1610M</td>
</tr>
<tr>
<td>15</td>
<td>47520.30</td>
<td>99638.93</td>
<td>79172.67</td>
<td>m^2</td>
<td>6B</td>
</tr>
<tr>
<td>16</td>
<td>11880.08</td>
<td>24909.73</td>
<td>19793.17</td>
<td>m^2</td>
<td>25B</td>
</tr>
<tr>
<td>17</td>
<td>2970.02</td>
<td>6227.43</td>
<td>4948.29</td>
<td>m^2</td>
<td>103B</td>
</tr>
<tr>
<td>18</td>
<td>742.50</td>
<td>1556.86</td>
<td>1237.07</td>
<td>m^2</td>
<td>412B</td>
</tr>
<tr>
<td>19</td>
<td>185.63</td>
<td>389.21</td>
<td>309.27</td>
<td>m^2</td>
<td>1649B</td>
</tr>
<tr>
<td>20</td>
<td>46.41</td>
<td>97.30</td>
<td>77.32</td>
<td>m^2</td>
<td>7T</td>
</tr>
<tr>
<td>21</td>
<td>11.60</td>
<td>24.33</td>
<td>19.33</td>
<td>m^2</td>
<td>26T</td>
</tr>
<tr>
<td>22</td>
<td>2.90</td>
<td>6.08</td>
<td>4.83</td>
<td>m^2</td>
<td>105T</td>
</tr>
<tr>
<td>23</td>
<td>0.73</td>
<td>1.52</td>
<td>1.21</td>
<td>m^2</td>
<td>422T</td>
</tr>
<tr>
<td>24</td>
<td>0.18</td>
<td>0.38</td>
<td>0.30</td>
<td>m^2</td>
<td>1689T</td>
</tr>
<tr>
<td>25</td>
<td>453.19</td>
<td>950.23</td>
<td>755.05</td>
<td>cm^2</td>
<td>7e15</td>
</tr>
<tr>
<td>26</td>
<td>113.30</td>
<td>237.56</td>
<td>188.76</td>
<td>cm^2</td>
<td>27e15</td>
</tr>
<tr>
<td>27</td>
<td>28.32</td>
<td>59.39</td>
<td>47.19</td>
<td>cm^2</td>
<td>108e15</td>
</tr>
<tr>
<td>28</td>
<td>7.08</td>
<td>14.85</td>
<td>11.80</td>
<td>cm^2</td>
<td>432e15</td>
</tr>
<tr>
<td>29</td>
<td>1.77</td>
<td>3.71</td>
<td>2.95</td>
<td>cm^2</td>
<td>1729e15</td>
</tr>
<tr>
<td>30</td>
<td>0.44</td>
<td>0.93</td>
<td>0.74</td>
<td>cm^2</td>
<td>7e18</td>
</tr>
</tbody>
</table>
<h2 id="数据存储">数据存储</h2>
<p>在Pegasus中,数据存储的key是hashkey+sortkey: hashkey用于数据partition,同一hashkey的数据存储在同一replica server下的一块或多块(由rocksdb实际存储的状态决定:数据随机写入后,同一hashkey下连续的sortkey空间可能分布在多个不连续的sst文件中,进行full compact后,会分布在连续的sst内)连续区域; sortkey用于在这块(或多块)区域中做数据排序的依据。</p>
<p>经纬度经过坐标转换得到一维编码(字符串)后,就可以把这个一维编码作为key存储起来做<strong>GEO索引数据</strong>了,这里需要将这个一维编码拆分成hashkey和sortkey两部分,可以根据实际的业务场景采取不同的划分策略。</p>
<p>GEO索引数据独立于原始数据,两类数据存储在不同的table内,通过<a href="https://github.com/apache/incubator-pegasus/blob/master/src/geo/lib/geo_client.h">geo_client</a>做数据同步,同时支持原生Pegasus API和GEO API访问。</p>
<p>下面讨论GEO索引数据的构造方式。</p>
<h3 id="hashkey">hashkey</h3>
<p>hashkey直接由一维编码的前缀构成。比如在我们的LBS业务场景中,范围查询都是集中在10km半径内的圆形范围,实际测试结果是将hashkey长度定为<code class="language-plaintext highlighter-rouge">14</code>(1位face,1位分隔符<code class="language-plaintext highlighter-rouge">/</code>,12位Hilbert编码)能取得更好的性能。</p>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">最小搜索层</code>为12</p>
</blockquote>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> cellid
|--------------32 bytes-------------|
|---14 bytes----|
hashkey
</code></pre></div></div>
<h3 id="sortkey">sortkey</h3>
<p>为了满足不同半径范围、不同精度的查询,我们把cellid剩下的18位全部放到sortkey中(这并不会给底层存储带来多少压力),这可以在应用层保持比较高的灵活性,而不用修改底层的数据。在进行较大范围的临近查询时,取更少的sortkey位数(对应的cellid更短)进行数据查询;进行较小范围的临近查询或点查询时,取更多的sortkey位数(对应的cellid更长)进行数据查询。</p>
<p>尽管在30层时,cell的面积已经足够小(&lt;1cm^2),但仍有可能两条数据落在同一个cell里,所以需要区分不同的数据。这里,将原始数据的hashkey和sortkey联合起来,并追加在上述sortkey之后。</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> cellid
|--------------32 bytes-------------|
|---14 bytes---||-----18 bytes------||--原始hashkey--||--原始sortkey--|
|--GEO hashkey-||----------------------GEO sortkey-------------------|
</code></pre></div></div>
<blockquote>
<p>在相同地理范围内进行数据查询时,使用较短的cellid,单次查询覆盖的范围更大,查询的次数更少,但得到的在区域外的无用数据也更多;反之亦然——这需要在查询次数与查询到的有用数据之间做权衡。</p>
</blockquote>
<h3 id="value">value</h3>
<p>GEO API的value必须能够解析出经纬度,具体的解析方式参考<a href="https://pegasus.apache.org/api/geo#%E8%87%AA%E5%AE%9A%E4%B9%89extrator">自定义extractor</a></p>
<p>GEO索引数据的value跟原始数据的value完全相同。这里会存在一份冗余,但通常在相对廉价的磁盘存储介质上,这是可以接受的。</p>
<blockquote>
<p>我们建议业务层在使用GEO API时value只存储小数据,大数据建议采用二次索引的方式。</p>
</blockquote>
<h2 id="数据更新">数据更新</h2>
<h3 id="set">set</h3>
<p><code class="language-plaintext highlighter-rouge">set</code>操作会同时更新两个table的数据: Pegasus原始数据和GEO索引数据(数据构造方式如上所述)。</p>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">set</code>操作的hashkey, sortkey是业务自己的格式,使用GEO API时并不做约束, 只是在geo client转存GEO索引数据时,会自动做如上所述的编码转换。</p>
</blockquote>
<blockquote>
<p>使用Redis API时, 参考 <a href="https://pegasus.apache.org/api/redis#geo-api">GEO API</a></p>
</blockquote>
<blockquote>
<p>实现上,<code class="language-plaintext highlighter-rouge">set</code>会首先尝试<code class="language-plaintext highlighter-rouge">get</code>出已有的数据,并将已有数据的GEO索引数据清理掉后,再写入新数据。因为新老数据的索引数据hashkey+sortkey可能不一样(即新老value根据extractor解析得到的经纬度不一样),若不清理,在进行地理搜索时将会搜索到脏数据。</p>
</blockquote>
<h3 id="del">del</h3>
<p><code class="language-plaintext highlighter-rouge">del</code>操作也会同时删除两个table的数据,原理同上。</p>
<h2 id="数据查询">数据查询</h2>
<h3 id="思路">思路</h3>
<p>直观地,集合中总的cell数量尽可能少,但同时单个cell面积尽可能小。比如:</p>
<p><img src="/assets/images/s2_cap_1.png" alt="s2_cap_1.png" class="img-responsive" /></p>
<p>虽然这样的结果更精确,但在实际测试中发现当参与计算的cell层级越大时,cellid的数量就越多,带来的client-server RPC次数更多,整个API消耗更大、延迟就越高。同时,在真实的应用场景中,太小的cell意义不大(没有数据)。</p>
<p>所以,在当前的Pegasus实现中,只联合使用两层cell,<code class="language-plaintext highlighter-rouge">最大搜索层</code>和<code class="language-plaintext highlighter-rouge">最小搜索层</code>, 以12层和16层为例:</p>
<p><img src="/assets/images/s2_cap_2.png" alt="s2_cap_2.png" class="img-responsive" /></p>
<h3 id="查询流程">查询流程</h3>
<p>以search_radial为例,此API的意义是给定点和半径,求出该圆形区域内的所有数据。</p>
<blockquote>
<p>这里我们只讨论圆形区域的数据查询,其他的比如多边形区域的思想是类似的。</p>
</blockquote>
<p>需利用S2提供的查询覆盖指定区域的cellid集合的API:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Returns an S2CellUnion that covers the given region and satisfies the current options.
S2CellUnion GetCovering(const S2Region&amp; region);
</code></pre></div></div>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">search_radial</code>API有两个重载函数,一个是输入经纬度,一个是输入hashkey+sortkey,后者是通过key取到经纬度再转调前者。</p>
</blockquote>
<p>查询流程如下:</p>
<ol>
<li>根据经纬度、半径,求出S2Cap圆形区域<code class="language-plaintext highlighter-rouge">C</code></li>
<li>根据圆形区域、指定的<code class="language-plaintext highlighter-rouge">最小搜索层</code>,通过<code class="language-plaintext highlighter-rouge">GetCovering</code>,求出在<code class="language-plaintext highlighter-rouge">最小搜索层</code>上的cellid集合</li>
<li>遍历这些cellid,判断cellid区域跟圆形区域<code class="language-plaintext highlighter-rouge">C</code>的关系
<ol>
<li>全覆盖:取该cellid的所有数据</li>
<li>半覆盖:将该cellid按<code class="language-plaintext highlighter-rouge">最大搜索层</code>继续拆分, 判断拆分后的sub_cellid区域与圆形区域<code class="language-plaintext highlighter-rouge">C</code>的关系
<ol>
<li>相交:取该sub_cellid的所有数据</li>
<li>不相交:排除</li>
</ol>
</li>
</ol>
</li>
</ol>
<blockquote>
<p><code class="language-plaintext highlighter-rouge">最小搜索层</code>和<code class="language-plaintext highlighter-rouge">最大搜索层</code>的配置参考后文。</p>
</blockquote>
<p>取一个cellid的所有数据时,会根据上文的key构造规则,构造一对包含这个cellid所有数据的<code class="language-plaintext highlighter-rouge">start_sortkey</code>和<code class="language-plaintext highlighter-rouge">stop_sortkey</code>,再使用Pegasus的<code class="language-plaintext highlighter-rouge">scan</code>接口进行数据搜索。</p>
<ul>
<li>对于<code class="language-plaintext highlighter-rouge">3.1</code>步取到的cellid,它的长度即是hashkey的长度,它也就是hashkey,调用<code class="language-plaintext highlighter-rouge">scan(cellid, "", "")</code>搜索数据
<ul>
<li>比如,一个12层cell <code class="language-plaintext highlighter-rouge">1/223320022232</code>被区域完全覆盖,则我们<code class="language-plaintext highlighter-rouge">scan("1/223320022232", "", "")</code></li>
</ul>
</li>
<li>对于<code class="language-plaintext highlighter-rouge">3.2.1</code>步取到的sub_cellid,hashkey是它的前缀,调用<code class="language-plaintext highlighter-rouge">scan(sub_cellid[0:hashkey_len], sub_cellid[hashkey_len:], sub_cellid[hashkey_len:])</code>搜索数据
<ul>
<li>比如,一个12层cell <code class="language-plaintext highlighter-rouge">1/223320022232</code>的子区域<code class="language-plaintext highlighter-rouge">0001</code>,<code class="language-plaintext highlighter-rouge">0002</code>,<code class="language-plaintext highlighter-rouge">0003</code>,<code class="language-plaintext highlighter-rouge">0100</code>才跟目标区域相交时,则我们<code class="language-plaintext highlighter-rouge">scan("1/223320022232", "0001", "0003")</code>和<code class="language-plaintext highlighter-rouge">scan("1/223320022232", "0100", "0100")</code></li>
</ul>
</li>
</ul>
<blockquote>
<p>此处还有一个根据Hilbert曲线实现的一个优化,具体参见<a href="https://github.com/apache/incubator-pegasus/blob/master/src/geo/lib/geo_client.cpp">代码</a></p>
</blockquote>
<p>得到<code class="language-plaintext highlighter-rouge">scan</code>的结果后,还需处理:</p>
<ul>
<li>计算距离:因为cellid可能只与输入区域部分重合,该点若在区域外, 需去除</li>
<li>排序:当有升序/降序要求时</li>
</ul>
<h3 id="灵活性">灵活性</h3>
<p>由于我们存储了完整的30层cellid,所以在实际使用中,可以按照自己的地理数据密度、延迟要求等情况调整API的<code class="language-plaintext highlighter-rouge">最大搜索层</code>。</p>
<blockquote>
<p>默认为<code class="language-plaintext highlighter-rouge">16</code></p>
</blockquote>
<h4 id="api方式">API方式</h4>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>dsn::error_s set_max_level(int level);
</code></pre></div></div>
<h4 id="配置文件方式">配置文件方式</h4>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[geo_client.lib]
max_level = 16
</code></pre></div></div>
<h3 id="不变性">不变性</h3>
<p>修改hashkey长度需要修改配置文件,但需注意:hashkey一旦确定,数据写入后该配置便不可修改,因为数据已按这个hashkey长度规则固化下来。</p>
<blockquote>
<p>默认为<code class="language-plaintext highlighter-rouge">12</code></p>
</blockquote>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[geo_client.lib]
;NOTE: 'min_level' is immutable after some data has been inserted into DB by geo_client.
min_level = 12
</code></pre></div></div>
<h2 id="自定义extrator">自定义extractor</h2>
<p>目前Pegasus支持从固定格式的value中解析经纬度。经纬度以字符串形式嵌入在value中,以<code class="language-plaintext highlighter-rouge">|</code>分割, 比如:<code class="language-plaintext highlighter-rouge">.*|115.886447|41.269031|.*</code>,它们的索引由配置文件中的<code class="language-plaintext highlighter-rouge">latitude_index</code>和<code class="language-plaintext highlighter-rouge">longitude_index</code>确定。</p>
<h2 id="api--redis-proxy">API &amp; redis proxy</h2>
<p>Pegasus GEO特性的使用有两种方式,一是直接使用C++ geo client;二是使用redis proxy。</p>
<p><a href="https://github.com/apache/incubator-pegasus/blob/master/src/geo/lib/geo_client.h">C++ geo client代码</a>中有详细的API说明,这里不再赘述。</p>
<h2 id="配置文件">配置文件</h2>
<p>redis proxy的使用请参考<a href="redis">Redis适配</a></p>
<p>GEO API添加的配置文件如下:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[geo_client.lib]
;NOTE: 'min_level' is immutable after some data has been inserted into DB by geo_client.
min_level = 12
max_level = 16
; 用于经纬度的extractor
latitude_index = 5
longitude_index = 4
</code></pre></div></div>
<h2 id="数据导入">数据导入</h2>
<p>有的使用场景是业务已经有普通的KV数据,需要根据这份已有的KV数据转换成如上述的数据格式,我们可以使用shell工具里的<code class="language-plaintext highlighter-rouge">copy_data</code>功能来实现。比如:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>copy_data -c target_cluster -a temp -g
</code></pre></div></div>
<p>此时目标集群是<code class="language-plaintext highlighter-rouge">target_cluster</code>,目标表是<code class="language-plaintext highlighter-rouge">temp</code>,它存储上述的普通数据,目标GEO索引数据表是<code class="language-plaintext highlighter-rouge">temp_geo</code>,它存储上述的GEO索引数据。</p>
<p>在进行<code class="language-plaintext highlighter-rouge">copy_data</code>操作之前,目标集群以及两个目标表都需要提前创建好。</p>
<p>数据导入完成后就可以搭建<code class="language-plaintext highlighter-rouge">redis_proxy</code>了,具体的说明参考<a href="redis">redis适配</a>,需要注意的是配置项:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>[apps.proxy]
; if using GEO APIs, an extra table name which stores geo index data should be appended, i.e.
arguments = redis_cluster temp temp_geo
</code></pre></div></div>
<h2 id="benchmark">benchmark</h2>
<h3 id="测试环境">测试环境</h3>
<p>机器配置:</p>
<ul>
<li>CPU:E5-2620v3 *2</li>
<li>内存:128GB</li>
<li>存储:480G SSD *8</li>
<li>网卡:1Gb</li>
</ul>
<p>集群配置:</p>
<ul>
<li>节点数:5个replica server节点(使用v1.9.2版本)</li>
<li>测试表的Partition数:128个</li>
<li>单条数据大小:120字节</li>
</ul>
<p>针对接口:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>void async_search_radial(double lat_degrees,
double lng_degrees,
double radius_m,
int count,
SortType sort_type,
int timeout_ms,
geo_search_callback_t &amp;&amp;callback);
</code></pre></div></div>
<p>传递参数:</p>
<p>lat_degrees、lng_degrees:每次都选取北京五环内的随机点</p>
<p>radius_m:如下表第一列,单位米</p>
<p>count:-1,表示不限定结果数量</p>
<p>sort_type:不排序</p>
<h3 id="测试结果">测试结果</h3>
<table>
<thead>
<tr>
<th>半径(m)</th>
<th>P50(ms)</th>
<th>P75(ms)</th>
<th>P99(ms)</th>
<th>P99.9(ms)</th>
<th>平均结果条数</th>
<th>单节点QPS</th>
</tr>
</thead>
<tbody>
<tr>
<td>50</td>
<td>1.63071622</td>
<td>1.84607433</td>
<td>4.04545455</td>
<td>6.28</td>
<td>9.4608</td>
<td>740.287</td>
</tr>
<tr>
<td>100</td>
<td>1.76</td>
<td>2.33614794</td>
<td>5.4</td>
<td>6.45319149</td>
<td>38.0296</td>
<td>656.66</td>
</tr>
<tr>
<td>200</td>
<td>2.41017042</td>
<td>3.31062092</td>
<td>6.41781609</td>
<td>9.60588235</td>
<td>154.3682</td>
<td>536.624</td>
</tr>
<tr>
<td>300</td>
<td>3.30833333</td>
<td>4.21979167</td>
<td>9.4310559</td>
<td>18</td>
<td>350.9676</td>
<td>434.491</td>
</tr>
<tr>
<td>500</td>
<td>5.07763975</td>
<td>6.84964682</td>
<td>16.84931507</td>
<td>21.78082192</td>
<td>986.0826</td>
<td>347.231</td>
</tr>
<tr>
<td>1000</td>
<td>12.28164727</td>
<td>18.70972532</td>
<td>43.18181818</td>
<td>57.049698</td>
<td>3947.5294</td>
<td>204.23</td>
</tr>
<tr>
<td>2000</td>
<td>35.78666667</td>
<td>54.7300885</td>
<td>108.7331378</td>
<td>148.616578</td>
<td>15674.1198</td>
<td>98.7633</td>
</tr>
</tbody>
</table>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
本页导航
</p>
<ul class="menu-list">
<li><a href="#pegasus-geo支持">Pegasus GEO支持</a>
<ul>
<li><a href="#背景">背景</a></li>
<li><a href="#坐标转换">坐标转换</a></li>
<li><a href="#编码精度">编码精度</a></li>
<li><a href="#数据存储">数据存储</a>
<ul>
<li><a href="#hashkey">hashkey</a></li>
<li><a href="#sortkey">sortkey</a></li>
<li><a href="#value">value</a></li>
</ul>
</li>
<li><a href="#数据更新">数据更新</a>
<ul>
<li><a href="#set">set</a></li>
<li><a href="#del">del</a></li>
</ul>
</li>
<li><a href="#数据查询">数据查询</a>
<ul>
<li><a href="#思路">思路</a></li>
<li><a href="#查询流程">查询流程</a></li>
<li><a href="#灵活性">灵活性</a>
<ul>
<li><a href="#api方式">API方式</a></li>
<li><a href="#配置文件方式">配置文件方式</a></li>
</ul>
</li>
<li><a href="#不变性">不变性</a></li>
</ul>
</li>
<li><a href="#自定义extrator">自定义extractor</a></li>
<li><a href="#api--redis-proxy">API &amp; redis proxy</a></li>
<li><a href="#配置文件">配置文件</a></li>
<li><a href="#数据导入">数据导入</a></li>
<li><a href="#benchmark">benchmark</a>
<ul>
<li><a href="#测试环境">测试环境</a></li>
<li><a href="#测试结果">测试结果</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
docsearch({
container: '#docsearch',
appId: 'QRN30RBW0S',
indexName: 'pegasus-apache',
apiKey: 'd3a3252fa344359766707a106c4ed88f',
debug: true
});
</script>
</body>
</html>