docs/usecases/ApacheSedonaSQL.ipynb - sedona - Git at Google

 {
  "cells": [
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "```\n",
     "Licensed to the Apache Software Foundation (ASF) under one\n",
     "or more contributor license agreements.  See the NOTICE file\n",
     "distributed with this work for additional information\n",
     "regarding copyright ownership.  The ASF licenses this file\n",
     "to you under the Apache License, Version 2.0 (the\n",
     "\"License\"); you may not use this file except in compliance\n",
     "with the License.  You may obtain a copy of the License at\n",
     "  http://www.apache.org/licenses/LICENSE-2.0\n",
     "Unless required by applicable law or agreed to in writing,\n",
     "software distributed under the License is distributed on an\n",
     "\"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
     "KIND, either express or implied.  See the License for the\n",
     "specific language governing permissions and limitations\n",
     "under the License.\n",
     "```"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
     "import geopandas as gpd\n",
     "from pyspark.sql import SparkSession\n",
     "\n",
     "from sedona.spark import *"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       ":: loading settings :: url = jar:file:/home/jovyan/spark-3.4.2-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n",
       "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n",
       "org.apache.sedona#sedona-spark-3.4_2.12 added as a dependency\n",
       "org.datasyslab#geotools-wrapper added as a dependency\n",
       "uk.co.gresearch.spark#spark-extension_2.12 added as a dependency\n",
       ":: resolving dependencies :: org.apache.spark#spark-submit-parent-713da225-9e04-45e3-9233-485d4b212b3c;1.0\n",
       "\tconfs: [default]\n",
       "\tfound org.apache.sedona#sedona-spark-3.4_2.12;1.6.0 in central\n",
       "\tfound org.apache.sedona#sedona-common;1.6.0 in central\n",
       "\tfound org.apache.commons#commons-math3;3.6.1 in central\n",
       "\tfound org.locationtech.jts#jts-core;1.19.0 in central\n",
       "\tfound org.wololo#jts2geojson;0.16.1 in central\n",
       "\tfound org.locationtech.spatial4j#spatial4j;0.8 in central\n",
       "\tfound com.google.geometry#s2-geometry;2.0.0 in central\n",
       "\tfound com.google.guava#guava;25.1-jre in central\n",
       "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n",
       "\tfound org.checkerframework#checker-qual;2.0.0 in central\n",
       "\tfound com.google.errorprone#error_prone_annotations;2.1.3 in central\n",
       "\tfound com.google.j2objc#j2objc-annotations;1.1 in central\n",
       "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.14 in central\n",
       "\tfound com.uber#h3;4.1.1 in central\n",
       "\tfound net.sf.geographiclib#GeographicLib-Java;1.52 in central\n",
       "\tfound com.github.ben-manes.caffeine#caffeine;2.9.2 in central\n",
       "\tfound org.checkerframework#checker-qual;3.10.0 in central\n",
       "\tfound com.google.errorprone#error_prone_annotations;2.5.1 in central\n",
       "\tfound org.apache.sedona#sedona-spark-common-3.4_2.12;1.6.0 in central\n",
       "\tfound commons-lang#commons-lang;2.6 in central\n",
       "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.5.0 in central\n",
       "\tfound org.beryx#awt-color-factory;1.0.0 in central\n",
       "\tfound org.datasyslab#geotools-wrapper;1.6.0-28.2 in central\n",
       "\tfound uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 in central\n",
       "\tfound com.github.scopt#scopt_2.12;4.1.0 in central\n",
       ":: resolution report :: resolve 2280ms :: artifacts dl 96ms\n",
       "\t:: modules in use:\n",
       "\tcom.github.ben-manes.caffeine#caffeine;2.9.2 from central in [default]\n",
       "\tcom.github.scopt#scopt_2.12;4.1.0 from central in [default]\n",
       "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n",
       "\tcom.google.errorprone#error_prone_annotations;2.5.1 from central in [default]\n",
       "\tcom.google.geometry#s2-geometry;2.0.0 from central in [default]\n",
       "\tcom.google.guava#guava;25.1-jre from central in [default]\n",
       "\tcom.google.j2objc#j2objc-annotations;1.1 from central in [default]\n",
       "\tcom.uber#h3;4.1.1 from central in [default]\n",
       "\tcommons-lang#commons-lang;2.6 from central in [default]\n",
       "\tnet.sf.geographiclib#GeographicLib-Java;1.52 from central in [default]\n",
       "\torg.apache.commons#commons-math3;3.6.1 from central in [default]\n",
       "\torg.apache.sedona#sedona-common;1.6.0 from central in [default]\n",
       "\torg.apache.sedona#sedona-spark-3.4_2.12;1.6.0 from central in [default]\n",
       "\torg.apache.sedona#sedona-spark-common-3.4_2.12;1.6.0 from central in [default]\n",
       "\torg.beryx#awt-color-factory;1.0.0 from central in [default]\n",
       "\torg.checkerframework#checker-qual;3.10.0 from central in [default]\n",
       "\torg.codehaus.mojo#animal-sniffer-annotations;1.14 from central in [default]\n",
       "\torg.datasyslab#geotools-wrapper;1.6.0-28.2 from central in [default]\n",
       "\torg.locationtech.jts#jts-core;1.19.0 from central in [default]\n",
       "\torg.locationtech.spatial4j#spatial4j;0.8 from central in [default]\n",
       "\torg.scala-lang.modules#scala-collection-compat_2.12;2.5.0 from central in [default]\n",
       "\torg.wololo#jts2geojson;0.16.1 from central in [default]\n",
       "\tuk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 from central in [default]\n",
       "\t:: evicted modules:\n",
       "\torg.checkerframework#checker-qual;2.0.0 by [org.checkerframework#checker-qual;3.10.0] in [default]\n",
       "\tcom.google.errorprone#error_prone_annotations;2.1.3 by [com.google.errorprone#error_prone_annotations;2.5.1] in [default]\n",
       "\t---------------------------------------------------------------------\n",
       "\t|                  |            modules            ||   artifacts   |\n",
       "\t|       conf       | number| search|dwnlded|evicted|| number|dwnlded|\n",
       "\t---------------------------------------------------------------------\n",
       "\t|      default     |   25  |   0   |   0   |   2   ||   23  |   0   |\n",
       "\t---------------------------------------------------------------------\n",
       ":: retrieving :: org.apache.spark#spark-submit-parent-713da225-9e04-45e3-9233-485d4b212b3c\n",
       "\tconfs: [default]\n",
       "\t0 artifacts copied, 23 already retrieved (0kB/88ms)\n",
       "24/05/22 18:06:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
       "Setting default log level to \"WARN\".\n",
       "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
       "                                                                                \r"
      ]
     }
    ],
    "source": [
     "config = (\n",
     "    SedonaContext.builder()\n",
     "    .config(\n",
     "        \"spark.jars.packages\",\n",
     "        \"uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4\",\n",
     "    )\n",
     "    .master(\"spark://localhost:7077\")\n",
     "    .getOrCreate()\n",
     ")\n",
     "\n",
     "sedona = SedonaContext.create(config)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Geometry Constructors"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### ST_Point"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "[Stage 4:>                                                          (0 + 1) / 1]\r"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+-----------------+\n",
       "|     arealandmark|\n",
       "+-----------------+\n",
       "|POINT (1.1 101.1)|\n",
       "|POINT (2.1 102.1)|\n",
       "|POINT (3.1 103.1)|\n",
       "|POINT (4.1 104.1)|\n",
       "|POINT (5.1 105.1)|\n",
       "+-----------------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "                                                                                \r"
      ]
     }
    ],
    "source": [
     "point_csv_df = (\n",
     "    sedona.read.format(\"csv\")\n",
     "    .option(\"delimiter\", \",\")\n",
     "    .option(\"header\", \"false\")\n",
     "    .load(\"data/testpoint.csv\")\n",
     ")\n",
     "\n",
     "point_csv_df.createOrReplaceTempView(\"pointtable\")\n",
     "\n",
     "point_df = sedona.sql(\n",
     "    \"select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable\"\n",
     ")\n",
     "point_df.show(5)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### ST_GeomFromText"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+----------------+--------------------+\n",
       "|            name|         countyshape|\n",
       "+----------------+--------------------+\n",
       "|   Cuming County|POLYGON ((-97.019...|\n",
       "|Wahkiakum County|POLYGON ((-123.43...|\n",
       "|  De Baca County|POLYGON ((-104.56...|\n",
       "|Lancaster County|POLYGON ((-96.910...|\n",
       "| Nuckolls County|POLYGON ((-98.273...|\n",
       "+----------------+--------------------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     }
    ],
    "source": [
     "polygon_wkt_df = (\n",
     "    sedona.read.format(\"csv\")\n",
     "    .option(\"delimiter\", \"\\t\")\n",
     "    .option(\"header\", \"false\")\n",
     "    .load(\"data/county_small.tsv\")\n",
     ")\n",
     "\n",
     "polygon_wkt_df.createOrReplaceTempView(\"polygontable\")\n",
     "polygon_df = sedona.sql(\n",
     "    \"select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable\"\n",
     ")\n",
     "polygon_df.show(5)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### ST_GeomFromWKB"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+----------------+--------------------+\n",
       "|            name|         countyshape|\n",
       "+----------------+--------------------+\n",
       "|   Cuming County|POLYGON ((-97.019...|\n",
       "|Wahkiakum County|POLYGON ((-123.43...|\n",
       "|  De Baca County|POLYGON ((-104.56...|\n",
       "|Lancaster County|POLYGON ((-96.910...|\n",
       "| Nuckolls County|POLYGON ((-98.273...|\n",
       "+----------------+--------------------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     }
    ],
    "source": [
     "polygon_wkb_df = (\n",
     "    sedona.read.format(\"csv\")\n",
     "    .option(\"delimiter\", \"\\t\")\n",
     "    .option(\"header\", \"false\")\n",
     "    .load(\"data/county_small_wkb.tsv\")\n",
     ")\n",
     "\n",
     "polygon_wkb_df.createOrReplaceTempView(\"polygontable\")\n",
     "polygon_df = sedona.sql(\n",
     "    \"select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable\"\n",
     ")\n",
     "polygon_df.show(5)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### ST_GeomFromGeoJSON"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+--------------------+\n",
       "|         countyshape|\n",
       "+--------------------+\n",
       "|POLYGON ((-87.621...|\n",
       "|POLYGON ((-85.719...|\n",
       "|POLYGON ((-86.000...|\n",
       "|POLYGON ((-86.574...|\n",
       "|POLYGON ((-85.382...|\n",
       "+--------------------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     }
    ],
    "source": [
     "polygon_json_df = (\n",
     "    sedona.read.format(\"csv\")\n",
     "    .option(\"delimiter\", \"\\t\")\n",
     "    .option(\"header\", \"false\")\n",
     "    .load(\"data/testPolygon.json\")\n",
     ")\n",
     "\n",
     "polygon_json_df.createOrReplaceTempView(\"polygontable\")\n",
     "polygon_df = sedona.sql(\n",
     "    \"select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable\"\n",
     ")\n",
     "polygon_df.show(5)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Spatial Operations"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Spatial Join - Distance Join"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "== Physical Plan ==\n",
       "BroadcastIndexJoin pointshape2#257: geometry, LeftSide, LeftSide, Inner, INTERSECTS, ( **org.apache.spark.sql.sedona_sql.expressions.ST_Distance**   < 2.0) ST_INTERSECTS(pointshape1#232, pointshape2#257)\n",
       ":- SpatialIndex pointshape1#232: geometry, RTREE, false, false, 2.0\n",
       ":  +- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point**   AS pointshape1#232, abc AS name1#233]\n",
       ":     +- FileScan csv [_c0#228,_c1#229] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/docs/usecases/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
       "+- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point**   AS pointshape2#257, def AS name2#258]\n",
       "   +- FileScan csv [_c0#253,_c1#254] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/docs/usecases/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n",
       "\n",
       "\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "                                                                                \r"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+-----------------+-----+-----------------+-----+\n",
       "|      pointshape1|name1|      pointshape2|name2|\n",
       "+-----------------+-----+-----------------+-----+\n",
       "|POINT (1.1 101.1)|  abc|POINT (1.1 101.1)|  def|\n",
       "|POINT (2.1 102.1)|  abc|POINT (1.1 101.1)|  def|\n",
       "|POINT (1.1 101.1)|  abc|POINT (2.1 102.1)|  def|\n",
       "|POINT (2.1 102.1)|  abc|POINT (2.1 102.1)|  def|\n",
       "|POINT (3.1 103.1)|  abc|POINT (2.1 102.1)|  def|\n",
       "+-----------------+-----+-----------------+-----+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     }
    ],
    "source": [
     "point_csv_df_1 = (\n",
     "    sedona.read.format(\"csv\")\n",
     "    .option(\"delimiter\", \",\")\n",
     "    .option(\"header\", \"false\")\n",
     "    .load(\"data/testpoint.csv\")\n",
     ")\n",
     "\n",
     "point_csv_df_1.createOrReplaceTempView(\"pointtable\")\n",
     "\n",
     "point_df1 = sedona.sql(\n",
     "    \"SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, 'abc' as name1 from pointtable\"\n",
     ")\n",
     "point_df1.createOrReplaceTempView(\"pointdf1\")\n",
     "\n",
     "point_csv_df2 = (\n",
     "    sedona.read.format(\"csv\")\n",
     "    .option(\"delimiter\", \",\")\n",
     "    .option(\"header\", \"false\")\n",
     "    .load(\"data/testpoint.csv\")\n",
     ")\n",
     "\n",
     "point_csv_df2.createOrReplaceTempView(\"pointtable\")\n",
     "point_df2 = sedona.sql(\n",
     "    \"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, 'def' as name2 from pointtable\"\n",
     ")\n",
     "point_df2.createOrReplaceTempView(\"pointdf2\")\n",
     "\n",
     "distance_join_df = sedona.sql(\n",
     "    \"select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2\"\n",
     ")\n",
     "distance_join_df.explain()\n",
     "distance_join_df.show(5)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Spatial Join - Range Join and RDD API Join\n",
     "\n",
     "Please refer to the example - airports per country: https://github.com/apache/sedona/blob/master/docs/usecases/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Converting GeoPandas to Apache Sedona"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "\n",
     "gdf = gpd.read_file(\"data/gis_osm_pois_free_1.shp\")\n",
     "gdf = gdf.replace(pd.NA, \"\")\n",
     "osm_points = sedona.createDataFrame(gdf)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "root\n",
       " |-- osm_id: string (nullable = true)\n",
       " |-- code: long (nullable = true)\n",
       " |-- fclass: string (nullable = true)\n",
       " |-- name: string (nullable = true)\n",
       " |-- geometry: geometry (nullable = true)\n",
       "\n"
      ]
     }
    ],
    "source": [
     "osm_points.printSchema()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "[Stage 15:>                                                         (0 + 1) / 1]\r"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+--------+----+---------+--------------+--------------------+\n",
       "|  osm_id|code|   fclass|          name|            geometry|\n",
       "+--------+----+---------+--------------+--------------------+\n",
       "|26860257|2422|camp_site|      de Kroon|POINT (15.3393145...|\n",
       "|26860294|2406|   chalet|Leśne Ustronie|POINT (14.8709625...|\n",
       "|29947493|2402|    motel|              |POINT (15.0946636...|\n",
       "|29947498|2602|      atm|              |POINT (15.0732014...|\n",
       "|29947499|2401|    hotel|              |POINT (15.0696777...|\n",
       "+--------+----+---------+--------------+--------------------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "                                                                                \r"
      ]
     }
    ],
    "source": [
     "osm_points.show(5)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "osm_points.createOrReplaceTempView(\"points\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
     "transformed_df = sedona.sql(\n",
     "    \"\"\"\n",
     "        SELECT osm_id,\n",
     "               code,\n",
     "               fclass,\n",
     "               name,\n",
     "               ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom \n",
     "        FROM points\n",
     "    \"\"\"\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "[Stage 16:>                                                         (0 + 1) / 1]\r"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+--------+----+---------+--------------+--------------------+\n",
       "|  osm_id|code|   fclass|          name|                geom|\n",
       "+--------+----+---------+--------------+--------------------+\n",
       "|26860257|2422|camp_site|      de Kroon|POINT (250776.778...|\n",
       "|26860294|2406|   chalet|Leśne Ustronie|POINT (221076.709...|\n",
       "|29947493|2402|    motel|              |POINT (233902.541...|\n",
       "|29947498|2602|      atm|              |POINT (232447.203...|\n",
       "|29947499|2401|    hotel|              |POINT (232208.377...|\n",
       "+--------+----+---------+--------------+--------------------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "                                                                                \r"
      ]
     }
    ],
    "source": [
     "transformed_df.show(5)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
     "transformed_df.createOrReplaceTempView(\"points_2180\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
     "neighbours_within_1000m = sedona.sql(\n",
     "    \"\"\"\n",
     "        SELECT a.osm_id AS id_1,\n",
     "               b.osm_id AS id_2,\n",
     "               a.geom \n",
     "        FROM points_2180 AS a, points_2180 AS b \n",
     "        WHERE ST_Distance(a.geom,b.geom) < 50\n",
     "    \"\"\"\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "24/05/22 18:07:13 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.\n",
       "[Stage 20:>                                                         (0 + 1) / 1]\r"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "+----------+---------+--------------------+\n",
       "|      id_1|     id_2|                geom|\n",
       "+----------+---------+--------------------+\n",
       "|  26860257| 26860257|POINT (250776.778...|\n",
       "|  26860294| 26860294|POINT (221076.709...|\n",
       "|  29947493| 29947493|POINT (233902.541...|\n",
       "|3241834852| 29947493|POINT (233866.098...|\n",
       "|5964811085| 29947493|POINT (233861.172...|\n",
       "|5818905324| 29947498|POINT (232446.535...|\n",
       "|4165181885| 29947498|POINT (232449.441...|\n",
       "|5846858758| 29947498|POINT (232407.167...|\n",
       "|  29947498| 29947498|POINT (232447.203...|\n",
       "|  29947499| 29947499|POINT (232208.377...|\n",
       "|  30077461| 29947499|POINT (232185.872...|\n",
       "|  29947505| 29947505|POINT (228595.321...|\n",
       "|  29947499| 30077461|POINT (232208.377...|\n",
       "|  30077461| 30077461|POINT (232185.872...|\n",
       "|  30079117| 30079117|POINT (273599.241...|\n",
       "| 197624402|197624402|POINT (203703.035...|\n",
       "| 197663196|197663196|POINT (203936.327...|\n",
       "| 197953474|197953474|POINT (203724.746...|\n",
       "|1074233127|262310516|POINT (203524.110...|\n",
       "| 262310516|262310516|POINT (203507.730...|\n",
       "+----------+---------+--------------------+\n",
       "only showing top 20 rows\n",
       "\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "                                                                                \r"
      ]
     }
    ],
    "source": [
     "neighbours_within_1000m.show()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Converting Apache Sedona to GeoPandas"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "24/05/22 18:07:17 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.\n",
       "                                                                                \r"
      ]
     }
    ],
    "source": [
     "df = neighbours_within_1000m.toPandas()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
     "gdf = gpd.GeoDataFrame(df, geometry=\"geom\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 19,
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
      "data": {
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
        "    .dataframe tbody tr th:only-of-type {\n",
        "        vertical-align: middle;\n",
        "    }\n",
        "\n",
        "    .dataframe tbody tr th {\n",
        "        vertical-align: top;\n",
        "    }\n",
        "\n",
        "    .dataframe thead th {\n",
        "        text-align: right;\n",
        "    }\n",
        "</style>\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>id_1</th>\n",
        "      <th>id_2</th>\n",
        "      <th>geom</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>26860257</td>\n",
        "      <td>26860257</td>\n",
        "      <td>POINT (250776.778 504581.332)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>26860294</td>\n",
        "      <td>26860294</td>\n",
        "      <td>POINT (221076.710 544222.650)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>29947493</td>\n",
        "      <td>29947493</td>\n",
        "      <td>POINT (233902.541 501298.382)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>3241834852</td>\n",
        "      <td>29947493</td>\n",
        "      <td>POINT (233866.099 501323.801)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>5964811085</td>\n",
        "      <td>29947493</td>\n",
        "      <td>POINT (233861.173 501326.441)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>65670</th>\n",
        "      <td>6823696220</td>\n",
        "      <td>6823696220</td>\n",
        "      <td>POINT (234310.474 465790.364)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>65671</th>\n",
        "      <td>6823721834</td>\n",
        "      <td>6823721834</td>\n",
        "      <td>POINT (234313.567 465869.023)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>65672</th>\n",
        "      <td>6823721838</td>\n",
        "      <td>6823721838</td>\n",
        "      <td>POINT (234425.224 465829.994)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>65673</th>\n",
        "      <td>6823772928</td>\n",
        "      <td>6823772928</td>\n",
        "      <td>POINT (234582.864 465875.142)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>65674</th>\n",
        "      <td>6823823445</td>\n",
        "      <td>6823823445</td>\n",
        "      <td>POINT (236859.419 428547.388)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>65675 rows × 3 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "             id_1        id_2                           geom\n",
        "0        26860257    26860257  POINT (250776.778 504581.332)\n",
        "1        26860294    26860294  POINT (221076.710 544222.650)\n",
        "2        29947493    29947493  POINT (233902.541 501298.382)\n",
        "3      3241834852    29947493  POINT (233866.099 501323.801)\n",
        "4      5964811085    29947493  POINT (233861.173 501326.441)\n",
        "...           ...         ...                            ...\n",
        "65670  6823696220  6823696220  POINT (234310.474 465790.364)\n",
        "65671  6823721834  6823721834  POINT (234313.567 465869.023)\n",
        "65672  6823721838  6823721838  POINT (234425.224 465829.994)\n",
        "65673  6823772928  6823772928  POINT (234582.864 465875.142)\n",
        "65674  6823823445  6823823445  POINT (236859.419 428547.388)\n",
        "\n",
        "[65675 rows x 3 columns]"
       ]
      },
      "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "gdf"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
     "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.9.18"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
 }