| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "```\n", |
| "Licensed to the Apache Software Foundation (ASF) under one\n", |
| "or more contributor license agreements. See the NOTICE file\n", |
| "distributed with this work for additional information\n", |
| "regarding copyright ownership. The ASF licenses this file\n", |
| "to you under the Apache License, Version 2.0 (the\n", |
| "\"License\"); you may not use this file except in compliance\n", |
| "with the License. You may obtain a copy of the License at\n", |
| " http://www.apache.org/licenses/LICENSE-2.0\n", |
| "Unless required by applicable law or agreed to in writing,\n", |
| "software distributed under the License is distributed on an\n", |
| "\"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", |
| "KIND, either express or implied. See the License for the\n", |
| "specific language governing permissions and limitations\n", |
| "under the License.\n", |
| "```" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "metadata": { |
| "tags": [] |
| }, |
| "outputs": [], |
| "source": [ |
| "import os\n", |
| "\n", |
| "import geopandas as gpd\n", |
| "from pyspark.sql import SparkSession\n", |
| "\n", |
| "from sedona.spark import *" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "tags": [] |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| ":: loading settings :: url = jar:file:/home/jovyan/spark-3.4.2-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n", |
| "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n", |
| "org.apache.sedona#sedona-spark-3.4_2.12 added as a dependency\n", |
| "org.datasyslab#geotools-wrapper added as a dependency\n", |
| "uk.co.gresearch.spark#spark-extension_2.12 added as a dependency\n", |
| ":: resolving dependencies :: org.apache.spark#spark-submit-parent-713da225-9e04-45e3-9233-485d4b212b3c;1.0\n", |
| "\tconfs: [default]\n", |
| "\tfound org.apache.sedona#sedona-spark-3.4_2.12;1.6.0 in central\n", |
| "\tfound org.apache.sedona#sedona-common;1.6.0 in central\n", |
| "\tfound org.apache.commons#commons-math3;3.6.1 in central\n", |
| "\tfound org.locationtech.jts#jts-core;1.19.0 in central\n", |
| "\tfound org.wololo#jts2geojson;0.16.1 in central\n", |
| "\tfound org.locationtech.spatial4j#spatial4j;0.8 in central\n", |
| "\tfound com.google.geometry#s2-geometry;2.0.0 in central\n", |
| "\tfound com.google.guava#guava;25.1-jre in central\n", |
| "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", |
| "\tfound org.checkerframework#checker-qual;2.0.0 in central\n", |
| "\tfound com.google.errorprone#error_prone_annotations;2.1.3 in central\n", |
| "\tfound com.google.j2objc#j2objc-annotations;1.1 in central\n", |
| "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.14 in central\n", |
| "\tfound com.uber#h3;4.1.1 in central\n", |
| "\tfound net.sf.geographiclib#GeographicLib-Java;1.52 in central\n", |
| "\tfound com.github.ben-manes.caffeine#caffeine;2.9.2 in central\n", |
| "\tfound org.checkerframework#checker-qual;3.10.0 in central\n", |
| "\tfound com.google.errorprone#error_prone_annotations;2.5.1 in central\n", |
| "\tfound org.apache.sedona#sedona-spark-common-3.4_2.12;1.6.0 in central\n", |
| "\tfound commons-lang#commons-lang;2.6 in central\n", |
| "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.5.0 in central\n", |
| "\tfound org.beryx#awt-color-factory;1.0.0 in central\n", |
| "\tfound org.datasyslab#geotools-wrapper;1.6.0-28.2 in central\n", |
| "\tfound uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 in central\n", |
| "\tfound com.github.scopt#scopt_2.12;4.1.0 in central\n", |
| ":: resolution report :: resolve 2280ms :: artifacts dl 96ms\n", |
| "\t:: modules in use:\n", |
| "\tcom.github.ben-manes.caffeine#caffeine;2.9.2 from central in [default]\n", |
| "\tcom.github.scopt#scopt_2.12;4.1.0 from central in [default]\n", |
| "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", |
| "\tcom.google.errorprone#error_prone_annotations;2.5.1 from central in [default]\n", |
| "\tcom.google.geometry#s2-geometry;2.0.0 from central in [default]\n", |
| "\tcom.google.guava#guava;25.1-jre from central in [default]\n", |
| "\tcom.google.j2objc#j2objc-annotations;1.1 from central in [default]\n", |
| "\tcom.uber#h3;4.1.1 from central in [default]\n", |
| "\tcommons-lang#commons-lang;2.6 from central in [default]\n", |
| "\tnet.sf.geographiclib#GeographicLib-Java;1.52 from central in [default]\n", |
| "\torg.apache.commons#commons-math3;3.6.1 from central in [default]\n", |
| "\torg.apache.sedona#sedona-common;1.6.0 from central in [default]\n", |
| "\torg.apache.sedona#sedona-spark-3.4_2.12;1.6.0 from central in [default]\n", |
| "\torg.apache.sedona#sedona-spark-common-3.4_2.12;1.6.0 from central in [default]\n", |
| "\torg.beryx#awt-color-factory;1.0.0 from central in [default]\n", |
| "\torg.checkerframework#checker-qual;3.10.0 from central in [default]\n", |
| "\torg.codehaus.mojo#animal-sniffer-annotations;1.14 from central in [default]\n", |
| "\torg.datasyslab#geotools-wrapper;1.6.0-28.2 from central in [default]\n", |
| "\torg.locationtech.jts#jts-core;1.19.0 from central in [default]\n", |
| "\torg.locationtech.spatial4j#spatial4j;0.8 from central in [default]\n", |
| "\torg.scala-lang.modules#scala-collection-compat_2.12;2.5.0 from central in [default]\n", |
| "\torg.wololo#jts2geojson;0.16.1 from central in [default]\n", |
| "\tuk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 from central in [default]\n", |
| "\t:: evicted modules:\n", |
| "\torg.checkerframework#checker-qual;2.0.0 by [org.checkerframework#checker-qual;3.10.0] in [default]\n", |
| "\tcom.google.errorprone#error_prone_annotations;2.1.3 by [com.google.errorprone#error_prone_annotations;2.5.1] in [default]\n", |
| "\t---------------------------------------------------------------------\n", |
| "\t| | modules || artifacts |\n", |
| "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", |
| "\t---------------------------------------------------------------------\n", |
| "\t| default | 25 | 0 | 0 | 2 || 23 | 0 |\n", |
| "\t---------------------------------------------------------------------\n", |
| ":: retrieving :: org.apache.spark#spark-submit-parent-713da225-9e04-45e3-9233-485d4b212b3c\n", |
| "\tconfs: [default]\n", |
| "\t0 artifacts copied, 23 already retrieved (0kB/88ms)\n", |
| "24/05/22 18:06:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", |
| "Setting default log level to \"WARN\".\n", |
| "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", |
| " \r" |
| ] |
| } |
| ], |
| "source": [ |
| "config = (\n", |
| " SedonaContext.builder()\n", |
| " .config(\n", |
| " \"spark.jars.packages\",\n", |
| " \"uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4\",\n", |
| " )\n", |
| " .master(\"spark://localhost:7077\")\n", |
| " .getOrCreate()\n", |
| ")\n", |
| "\n", |
| "sedona = SedonaContext.create(config)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Geometry Constructors" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### ST_Point" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "[Stage 4:> (0 + 1) / 1]\r" |
| ] |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+-----------------+\n", |
| "| arealandmark|\n", |
| "+-----------------+\n", |
| "|POINT (1.1 101.1)|\n", |
| "|POINT (2.1 102.1)|\n", |
| "|POINT (3.1 103.1)|\n", |
| "|POINT (4.1 104.1)|\n", |
| "|POINT (5.1 105.1)|\n", |
| "+-----------------+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| " \r" |
| ] |
| } |
| ], |
| "source": [ |
| "point_csv_df = (\n", |
| " sedona.read.format(\"csv\")\n", |
| " .option(\"delimiter\", \",\")\n", |
| " .option(\"header\", \"false\")\n", |
| " .load(\"data/testpoint.csv\")\n", |
| ")\n", |
| "\n", |
| "point_csv_df.createOrReplaceTempView(\"pointtable\")\n", |
| "\n", |
| "point_df = sedona.sql(\n", |
| " \"select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable\"\n", |
| ")\n", |
| "point_df.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### ST_GeomFromText" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 4, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+----------------+--------------------+\n", |
| "| name| countyshape|\n", |
| "+----------------+--------------------+\n", |
| "| Cuming County|POLYGON ((-97.019...|\n", |
| "|Wahkiakum County|POLYGON ((-123.43...|\n", |
| "| De Baca County|POLYGON ((-104.56...|\n", |
| "|Lancaster County|POLYGON ((-96.910...|\n", |
| "| Nuckolls County|POLYGON ((-98.273...|\n", |
| "+----------------+--------------------+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "polygon_wkt_df = (\n", |
| " sedona.read.format(\"csv\")\n", |
| " .option(\"delimiter\", \"\\t\")\n", |
| " .option(\"header\", \"false\")\n", |
| " .load(\"data/county_small.tsv\")\n", |
| ")\n", |
| "\n", |
| "polygon_wkt_df.createOrReplaceTempView(\"polygontable\")\n", |
| "polygon_df = sedona.sql(\n", |
| " \"select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable\"\n", |
| ")\n", |
| "polygon_df.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### ST_GeomFromWKB" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+----------------+--------------------+\n", |
| "| name| countyshape|\n", |
| "+----------------+--------------------+\n", |
| "| Cuming County|POLYGON ((-97.019...|\n", |
| "|Wahkiakum County|POLYGON ((-123.43...|\n", |
| "| De Baca County|POLYGON ((-104.56...|\n", |
| "|Lancaster County|POLYGON ((-96.910...|\n", |
| "| Nuckolls County|POLYGON ((-98.273...|\n", |
| "+----------------+--------------------+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "polygon_wkb_df = (\n", |
| " sedona.read.format(\"csv\")\n", |
| " .option(\"delimiter\", \"\\t\")\n", |
| " .option(\"header\", \"false\")\n", |
| " .load(\"data/county_small_wkb.tsv\")\n", |
| ")\n", |
| "\n", |
| "polygon_wkb_df.createOrReplaceTempView(\"polygontable\")\n", |
| "polygon_df = sedona.sql(\n", |
| " \"select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable\"\n", |
| ")\n", |
| "polygon_df.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### ST_GeomFromGeoJSON" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+--------------------+\n", |
| "| countyshape|\n", |
| "+--------------------+\n", |
| "|POLYGON ((-87.621...|\n", |
| "|POLYGON ((-85.719...|\n", |
| "|POLYGON ((-86.000...|\n", |
| "|POLYGON ((-86.574...|\n", |
| "|POLYGON ((-85.382...|\n", |
| "+--------------------+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "polygon_json_df = (\n", |
| " sedona.read.format(\"csv\")\n", |
| " .option(\"delimiter\", \"\\t\")\n", |
| " .option(\"header\", \"false\")\n", |
| " .load(\"data/testPolygon.json\")\n", |
| ")\n", |
| "\n", |
| "polygon_json_df.createOrReplaceTempView(\"polygontable\")\n", |
| "polygon_df = sedona.sql(\n", |
| " \"select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable\"\n", |
| ")\n", |
| "polygon_df.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Spatial Operations" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### Spatial Join - Distance Join" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 7, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "== Physical Plan ==\n", |
| "BroadcastIndexJoin pointshape2#257: geometry, LeftSide, LeftSide, Inner, INTERSECTS, ( **org.apache.spark.sql.sedona_sql.expressions.ST_Distance** < 2.0) ST_INTERSECTS(pointshape1#232, pointshape2#257)\n", |
| ":- SpatialIndex pointshape1#232: geometry, RTREE, false, false, 2.0\n", |
| ": +- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point** AS pointshape1#232, abc AS name1#233]\n", |
| ": +- FileScan csv [_c0#228,_c1#229] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/docs/usecases/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n", |
| "+- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point** AS pointshape2#257, def AS name2#258]\n", |
| " +- FileScan csv [_c0#253,_c1#254] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/docs/usecases/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n", |
| "\n", |
| "\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| " \r" |
| ] |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+-----------------+-----+-----------------+-----+\n", |
| "| pointshape1|name1| pointshape2|name2|\n", |
| "+-----------------+-----+-----------------+-----+\n", |
| "|POINT (1.1 101.1)| abc|POINT (1.1 101.1)| def|\n", |
| "|POINT (2.1 102.1)| abc|POINT (1.1 101.1)| def|\n", |
| "|POINT (1.1 101.1)| abc|POINT (2.1 102.1)| def|\n", |
| "|POINT (2.1 102.1)| abc|POINT (2.1 102.1)| def|\n", |
| "|POINT (3.1 103.1)| abc|POINT (2.1 102.1)| def|\n", |
| "+-----------------+-----+-----------------+-----+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "point_csv_df_1 = (\n", |
| " sedona.read.format(\"csv\")\n", |
| " .option(\"delimiter\", \",\")\n", |
| " .option(\"header\", \"false\")\n", |
| " .load(\"data/testpoint.csv\")\n", |
| ")\n", |
| "\n", |
| "point_csv_df_1.createOrReplaceTempView(\"pointtable\")\n", |
| "\n", |
| "point_df1 = sedona.sql(\n", |
| " \"SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, 'abc' as name1 from pointtable\"\n", |
| ")\n", |
| "point_df1.createOrReplaceTempView(\"pointdf1\")\n", |
| "\n", |
| "point_csv_df2 = (\n", |
| " sedona.read.format(\"csv\")\n", |
| " .option(\"delimiter\", \",\")\n", |
| " .option(\"header\", \"false\")\n", |
| " .load(\"data/testpoint.csv\")\n", |
| ")\n", |
| "\n", |
| "point_csv_df2.createOrReplaceTempView(\"pointtable\")\n", |
| "point_df2 = sedona.sql(\n", |
| " \"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, 'def' as name2 from pointtable\"\n", |
| ")\n", |
| "point_df2.createOrReplaceTempView(\"pointdf2\")\n", |
| "\n", |
| "distance_join_df = sedona.sql(\n", |
| " \"select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2\"\n", |
| ")\n", |
| "distance_join_df.explain()\n", |
| "distance_join_df.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### Spatial Join - Range Join and RDD API Join\n", |
| "\n", |
| "Please refer to the example - airports per country: https://github.com/apache/sedona/blob/master/docs/usecases/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### Converting GeoPandas to Apache Sedona" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 8, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "import pandas as pd\n", |
| "\n", |
| "gdf = gpd.read_file(\"data/gis_osm_pois_free_1.shp\")\n", |
| "gdf = gdf.replace(pd.NA, \"\")\n", |
| "osm_points = sedona.createDataFrame(gdf)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 9, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "root\n", |
| " |-- osm_id: string (nullable = true)\n", |
| " |-- code: long (nullable = true)\n", |
| " |-- fclass: string (nullable = true)\n", |
| " |-- name: string (nullable = true)\n", |
| " |-- geometry: geometry (nullable = true)\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "osm_points.printSchema()" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 10, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "[Stage 15:> (0 + 1) / 1]\r" |
| ] |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+--------+----+---------+--------------+--------------------+\n", |
| "| osm_id|code| fclass| name| geometry|\n", |
| "+--------+----+---------+--------------+--------------------+\n", |
| "|26860257|2422|camp_site| de Kroon|POINT (15.3393145...|\n", |
| "|26860294|2406| chalet|Leśne Ustronie|POINT (14.8709625...|\n", |
| "|29947493|2402| motel| |POINT (15.0946636...|\n", |
| "|29947498|2602| atm| |POINT (15.0732014...|\n", |
| "|29947499|2401| hotel| |POINT (15.0696777...|\n", |
| "+--------+----+---------+--------------+--------------------+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| " \r" |
| ] |
| } |
| ], |
| "source": [ |
| "osm_points.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 11, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "osm_points.createOrReplaceTempView(\"points\")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 12, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "transformed_df = sedona.sql(\n", |
| " \"\"\"\n", |
| " SELECT osm_id,\n", |
| " code,\n", |
| " fclass,\n", |
| " name,\n", |
| " ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom \n", |
| " FROM points\n", |
| " \"\"\"\n", |
| ")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 13, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "[Stage 16:> (0 + 1) / 1]\r" |
| ] |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+--------+----+---------+--------------+--------------------+\n", |
| "| osm_id|code| fclass| name| geom|\n", |
| "+--------+----+---------+--------------+--------------------+\n", |
| "|26860257|2422|camp_site| de Kroon|POINT (250776.778...|\n", |
| "|26860294|2406| chalet|Leśne Ustronie|POINT (221076.709...|\n", |
| "|29947493|2402| motel| |POINT (233902.541...|\n", |
| "|29947498|2602| atm| |POINT (232447.203...|\n", |
| "|29947499|2401| hotel| |POINT (232208.377...|\n", |
| "+--------+----+---------+--------------+--------------------+\n", |
| "only showing top 5 rows\n", |
| "\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| " \r" |
| ] |
| } |
| ], |
| "source": [ |
| "transformed_df.show(5)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 14, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "transformed_df.createOrReplaceTempView(\"points_2180\")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 15, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "neighbours_within_1000m = sedona.sql(\n", |
| " \"\"\"\n", |
| " SELECT a.osm_id AS id_1,\n", |
| " b.osm_id AS id_2,\n", |
| " a.geom \n", |
| " FROM points_2180 AS a, points_2180 AS b \n", |
| " WHERE ST_Distance(a.geom,b.geom) < 50\n", |
| " \"\"\"\n", |
| ")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 16, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "24/05/22 18:07:13 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.\n", |
| "[Stage 20:> (0 + 1) / 1]\r" |
| ] |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "+----------+---------+--------------------+\n", |
| "| id_1| id_2| geom|\n", |
| "+----------+---------+--------------------+\n", |
| "| 26860257| 26860257|POINT (250776.778...|\n", |
| "| 26860294| 26860294|POINT (221076.709...|\n", |
| "| 29947493| 29947493|POINT (233902.541...|\n", |
| "|3241834852| 29947493|POINT (233866.098...|\n", |
| "|5964811085| 29947493|POINT (233861.172...|\n", |
| "|5818905324| 29947498|POINT (232446.535...|\n", |
| "|4165181885| 29947498|POINT (232449.441...|\n", |
| "|5846858758| 29947498|POINT (232407.167...|\n", |
| "| 29947498| 29947498|POINT (232447.203...|\n", |
| "| 29947499| 29947499|POINT (232208.377...|\n", |
| "| 30077461| 29947499|POINT (232185.872...|\n", |
| "| 29947505| 29947505|POINT (228595.321...|\n", |
| "| 29947499| 30077461|POINT (232208.377...|\n", |
| "| 30077461| 30077461|POINT (232185.872...|\n", |
| "| 30079117| 30079117|POINT (273599.241...|\n", |
| "| 197624402|197624402|POINT (203703.035...|\n", |
| "| 197663196|197663196|POINT (203936.327...|\n", |
| "| 197953474|197953474|POINT (203724.746...|\n", |
| "|1074233127|262310516|POINT (203524.110...|\n", |
| "| 262310516|262310516|POINT (203507.730...|\n", |
| "+----------+---------+--------------------+\n", |
| "only showing top 20 rows\n", |
| "\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| " \r" |
| ] |
| } |
| ], |
| "source": [ |
| "neighbours_within_1000m.show()" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Converting Apache Sedona to GeoPandas" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 17, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "24/05/22 18:07:17 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.\n", |
| " \r" |
| ] |
| } |
| ], |
| "source": [ |
| "df = neighbours_within_1000m.toPandas()" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 18, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "gdf = gpd.GeoDataFrame(df, geometry=\"geom\")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 19, |
| "metadata": { |
| "tags": [] |
| }, |
| "outputs": [ |
| { |
| "data": { |
| "text/html": [ |
| "<div>\n", |
| "<style scoped>\n", |
| " .dataframe tbody tr th:only-of-type {\n", |
| " vertical-align: middle;\n", |
| " }\n", |
| "\n", |
| " .dataframe tbody tr th {\n", |
| " vertical-align: top;\n", |
| " }\n", |
| "\n", |
| " .dataframe thead th {\n", |
| " text-align: right;\n", |
| " }\n", |
| "</style>\n", |
| "<table border=\"1\" class=\"dataframe\">\n", |
| " <thead>\n", |
| " <tr style=\"text-align: right;\">\n", |
| " <th></th>\n", |
| " <th>id_1</th>\n", |
| " <th>id_2</th>\n", |
| " <th>geom</th>\n", |
| " </tr>\n", |
| " </thead>\n", |
| " <tbody>\n", |
| " <tr>\n", |
| " <th>0</th>\n", |
| " <td>26860257</td>\n", |
| " <td>26860257</td>\n", |
| " <td>POINT (250776.778 504581.332)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>1</th>\n", |
| " <td>26860294</td>\n", |
| " <td>26860294</td>\n", |
| " <td>POINT (221076.710 544222.650)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>2</th>\n", |
| " <td>29947493</td>\n", |
| " <td>29947493</td>\n", |
| " <td>POINT (233902.541 501298.382)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>3</th>\n", |
| " <td>3241834852</td>\n", |
| " <td>29947493</td>\n", |
| " <td>POINT (233866.099 501323.801)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>4</th>\n", |
| " <td>5964811085</td>\n", |
| " <td>29947493</td>\n", |
| " <td>POINT (233861.173 501326.441)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>...</th>\n", |
| " <td>...</td>\n", |
| " <td>...</td>\n", |
| " <td>...</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>65670</th>\n", |
| " <td>6823696220</td>\n", |
| " <td>6823696220</td>\n", |
| " <td>POINT (234310.474 465790.364)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>65671</th>\n", |
| " <td>6823721834</td>\n", |
| " <td>6823721834</td>\n", |
| " <td>POINT (234313.567 465869.023)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>65672</th>\n", |
| " <td>6823721838</td>\n", |
| " <td>6823721838</td>\n", |
| " <td>POINT (234425.224 465829.994)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>65673</th>\n", |
| " <td>6823772928</td>\n", |
| " <td>6823772928</td>\n", |
| " <td>POINT (234582.864 465875.142)</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <th>65674</th>\n", |
| " <td>6823823445</td>\n", |
| " <td>6823823445</td>\n", |
| " <td>POINT (236859.419 428547.388)</td>\n", |
| " </tr>\n", |
| " </tbody>\n", |
| "</table>\n", |
| "<p>65675 rows × 3 columns</p>\n", |
| "</div>" |
| ], |
| "text/plain": [ |
| " id_1 id_2 geom\n", |
| "0 26860257 26860257 POINT (250776.778 504581.332)\n", |
| "1 26860294 26860294 POINT (221076.710 544222.650)\n", |
| "2 29947493 29947493 POINT (233902.541 501298.382)\n", |
| "3 3241834852 29947493 POINT (233866.099 501323.801)\n", |
| "4 5964811085 29947493 POINT (233861.173 501326.441)\n", |
| "... ... ... ...\n", |
| "65670 6823696220 6823696220 POINT (234310.474 465790.364)\n", |
| "65671 6823721834 6823721834 POINT (234313.567 465869.023)\n", |
| "65672 6823721838 6823721838 POINT (234425.224 465829.994)\n", |
| "65673 6823772928 6823772928 POINT (234582.864 465875.142)\n", |
| "65674 6823823445 6823823445 POINT (236859.419 428547.388)\n", |
| "\n", |
| "[65675 rows x 3 columns]" |
| ] |
| }, |
| "execution_count": 19, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "gdf" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 3 (ipykernel)", |
| "language": "python", |
| "name": "python3" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 3 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython3", |
| "version": "3.9.18" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 4 |
| } |