diff --git a/docs/delta-lake.ipynb b/docs/delta-lake.ipynb new file mode 100644 index 000000000..131489736 --- /dev/null +++ b/docs/delta-lake.ipynb @@ -0,0 +1,306 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3425268d-6430-4e52-9019-969d61ef5458", + "metadata": {}, + "source": [ + "# SedonaDB + Delta Lake\n", + "\n", + "This page shows how to read and write Delta Lake tables with SedonaDB.\n", + "\n", + "Make sure you run `pip install deltalake` to run the cells in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "83f5ca25-8059-4624-bd00-e44cd172d9c2", + "metadata": {}, + "outputs": [], + "source": [ + "from deltalake import write_deltalake, DeltaTable\n", + "import sedona.db\n", + "\n", + "sd = sedona.db.connect()" + ] + }, + { + "cell_type": "markdown", + "id": "1a90b1bf-99f9-4f50-987b-f1f30bf9988a", + "metadata": {}, + "source": [ + "Read in a GeoParquet dataset into a SedonaDB DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a37f3a30-3267-4bfc-8e5b-e234053927af", + "metadata": {}, + "outputs": [], + "source": [ + "countries = sd.read_parquet(\n", + " \"https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-earth/files/natural-earth_countries_geo.parquet\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1f82b047-8182-4120-82d7-945ff38ecbca", + "metadata": {}, + "source": [ + "## Create a Delta Lake table\n", + "\n", + "Now write the DataFrame to a Delta Lake table. Notice that you must convert the geometry column to Well-Known Text (WKT) before writing to the Delta table.\n", + "\n", + "Delta Lake does not support geometry columns." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "35bdc296-d9ef-4ea2-9ba5-dac1cfd115ef", + "metadata": {}, + "outputs": [], + "source": [ + "countries.to_view(\"countries\", True)\n", + "df = sd.sql(\n", + " \"select name, continent, ST_AsText(geometry) as geometry_wkt from countries\"\n", + ")\n", + "table_path = \"/tmp/delta_with_wkt\"\n", + "write_deltalake(table_path, df.to_pandas(), mode=\"overwrite\")" + ] + }, + { + "cell_type": "markdown", + "id": "974a5558-fb18-49b7-a304-fa93bdd363e8", + "metadata": {}, + "source": [ + "## Read Delta table into SedonaDB\n", + "\n", + "Now read the Delta table back into a SedonaDB DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c15d4605-483a-4041-973a-547098bdeef4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────────────────────────┬───────────────┬────────────────────────────────────────────────────┐\n", + "│ name ┆ continent ┆ geometry_wkt │\n", + "│ utf8 ┆ utf8 ┆ utf8 │\n", + "╞═════════════════════════════╪═══════════════╪════════════════════════════════════════════════════╡\n", + "│ Fiji ┆ Oceania ┆ MULTIPOLYGON(((180 -16.067132663642447,180 -16.55… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ United Republic of Tanzania ┆ Africa ┆ POLYGON((33.90371119710453 -0.9500000000000001,34… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Western Sahara ┆ Africa ┆ POLYGON((-8.665589565454809 27.656425889592356,-8… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Canada ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ United States of America ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Kazakhstan ┆ Asia ┆ POLYGON((87.35997033076265 49.21498078062912,86.5… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Uzbekistan ┆ Asia ┆ POLYGON((55.96819135928291 41.30864166926936,55.9… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Papua New Guinea ┆ Oceania ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Indonesia ┆ Asia ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Argentina ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887… │\n", + "└─────────────────────────────┴───────────────┴────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "dt = DeltaTable(table_path)\n", + "arrow_table = dt.to_pyarrow_table()\n", + "df = sd.create_data_frame(arrow_table)\n", + "df.show()" + ] + }, + { + "cell_type": "markdown", + "id": "a44fd9bd-85d2-4f1b-9df4-cc98ac6223f0", + "metadata": {}, + "source": [ + "Notice that the `geometry_wkt` column is `utf8`. It's not a geometry column.\n", + "\n", + "Let's convert the `geometry_wkt` column to a geometry column." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9052c4af-4478-4038-81ae-cf9c0ba84345", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────────────────────────┬───────────────┬────────────────────────────────────────────────────┐\n", + "│ name ┆ continent ┆ geom │\n", + "│ utf8 ┆ utf8 ┆ geometry │\n", + "╞═════════════════════════════╪═══════════════╪════════════════════════════════════════════════════╡\n", + "│ Fiji ┆ Oceania ┆ MULTIPOLYGON(((180 -16.067132663642447,180 -16.55… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ United Republic of Tanzania ┆ Africa ┆ POLYGON((33.90371119710453 -0.9500000000000001,34… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Western Sahara ┆ Africa ┆ POLYGON((-8.665589565454809 27.656425889592356,-8… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Canada ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ United States of America ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Kazakhstan ┆ Asia ┆ POLYGON((87.35997033076265 49.21498078062912,86.5… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Uzbekistan ┆ Asia ┆ POLYGON((55.96819135928291 41.30864166926936,55.9… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Papua New Guinea ┆ Oceania ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Indonesia ┆ Asia ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Argentina ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887… │\n", + "└─────────────────────────────┴───────────────┴────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "df.to_view(\"my_table\", True)\n", + "res = sd.sql(\"\"\"\n", + "SELECT\n", + " name,\n", + " continent,\n", + " ST_GeomFromWKT(geometry_wkt) as geom\n", + "from my_table\n", + "\"\"\")\n", + "res.show()" + ] + }, + { + "cell_type": "markdown", + "id": "802be3a7-5b1d-4f02-85f9-9460743a921d", + "metadata": {}, + "source": [ + "Confirm the schema of the DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5758bfec-86c0-4a14-8e4e-201149a59fac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SedonaSchema with 3 fields:\n", + " name: utf8\n", + " continent: utf8\n", + " geom: geometry" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.schema" + ] + }, + { + "cell_type": "markdown", + "id": "7c975e52-5b2d-4ee9-a7c6-51cb3304c0d6", + "metadata": {}, + "source": [ + "## Filter countries in a particular geographic region\n", + "\n", + "Now, let's grab all the countries in the western half of South America using a polygon region.\n", + "\n", + "SedonaDB can run these types of queries on geometric data." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c210e29a-170b-4a01-b464-c7d2f79998eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌──────────────────┬───────────────┬───────────────────────────────────────────────────────────────┐\n", + "│ name ┆ continent ┆ geom │\n", + "│ utf8 ┆ utf8 ┆ geometry │\n", + "╞══════════════════╪═══════════════╪═══════════════════════════════════════════════════════════════╡\n", + "│ Argentina ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887449,-68.25 … │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Chile ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887449,-68.633… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Falkland Islands ┆ South America ┆ POLYGON((-61.2 -51.85,-60 -51.25,-59.15 -51.5,-58.5500000000… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Uruguay ┆ South America ┆ POLYGON((-57.62513342958296 -30.21629485445426,-56.976025763… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Brazil ┆ South America ┆ POLYGON((-53.373661668498244 -33.768377780900764,-53.6505439… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Bolivia ┆ South America ┆ POLYGON((-69.52967810736496 -10.951734307502194,-68.78615759… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Peru ┆ South America ┆ POLYGON((-69.89363521999663 -4.2981869441943275,-70.79476884… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Colombia ┆ South America ┆ POLYGON((-66.87632585312258 1.253360500489336,-67.0650481838… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Panama ┆ North America ┆ POLYGON((-77.35336076527386 8.67050466555807,-77.47472286651… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Venezuela ┆ South America ┆ POLYGON((-60.73357418480372 5.200277207861901,-60.6011791652… │\n", + "└──────────────────┴───────────────┴───────────────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "res = sd.sql(\"\"\"\n", + "SELECT\n", + " name,\n", + " continent,\n", + " ST_GeomFromWKT(geometry_wkt) as geom\n", + "FROM my_table\n", + "WHERE ST_Intersects(\n", + " ST_GeomFromWKT(geometry_wkt),\n", + " ST_GeomFromWKT('POLYGON((-81 12, -58 12, -58 -55, -81 -55, -81 12))')\n", + ")\n", + "\"\"\")\n", + "res.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/delta-lake.md b/docs/delta-lake.md new file mode 100644 index 000000000..b3afc6930 --- /dev/null +++ b/docs/delta-lake.md @@ -0,0 +1,202 @@ + + +# SedonaDB + Delta Lake + +This page shows how to read and write Delta Lake tables with SedonaDB. + +Make sure you run `pip install deltalake` to run the cells in this notebook. + + +```python +from deltalake import write_deltalake, DeltaTable +import sedona.db + +sd = sedona.db.connect() +``` + +Read in a GeoParquet dataset into a SedonaDB DataFrame. + + +```python +countries = sd.read_parquet( + "https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-earth/files/natural-earth_countries_geo.parquet" +) +``` + +## Create a Delta Lake table + +Now write the DataFrame to a Delta Lake table. Notice that you must convert the geometry column to Well-Known Text (WKT) before writing to the Delta table. + +Delta Lake does not support geometry columns. + + +```python +countries.to_view("countries", True) +df = sd.sql( + "select name, continent, ST_AsText(geometry) as geometry_wkt from countries" +) +table_path = "/tmp/delta_with_wkt" +write_deltalake(table_path, df.to_pandas(), mode="overwrite") +``` + +## Read Delta table into SedonaDB + +Now read the Delta table back into a SedonaDB DataFrame. + + +```python +dt = DeltaTable(table_path) +arrow_table = dt.to_pyarrow_table() +df = sd.create_data_frame(arrow_table) +df.show() +``` + + ┌─────────────────────────────┬───────────────┬────────────────────────────────────────────────────┐ + │ name ┆ continent ┆ geometry_wkt │ + │ utf8 ┆ utf8 ┆ utf8 │ + ╞═════════════════════════════╪═══════════════╪════════════════════════════════════════════════════╡ + │ Fiji ┆ Oceania ┆ MULTIPOLYGON(((180 -16.067132663642447,180 -16.55… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ United Republic of Tanzania ┆ Africa ┆ POLYGON((33.90371119710453 -0.9500000000000001,34… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Western Sahara ┆ Africa ┆ POLYGON((-8.665589565454809 27.656425889592356,-8… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Canada ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ United States of America ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Kazakhstan ┆ Asia ┆ POLYGON((87.35997033076265 49.21498078062912,86.5… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Uzbekistan ┆ Asia ┆ POLYGON((55.96819135928291 41.30864166926936,55.9… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Papua New Guinea ┆ Oceania ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Indonesia ┆ Asia ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Argentina ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887… │ + └─────────────────────────────┴───────────────┴────────────────────────────────────────────────────┘ + + +Notice that the `geometry_wkt` column is `utf8`. It's not a geometry column. + +Let's convert the `geometry_wkt` column to a geometry column. + + +```python +df.to_view("my_table", True) +res = sd.sql(""" +SELECT + name, + continent, + ST_GeomFromWKT(geometry_wkt) as geom +from my_table +""") +res.show() +``` + + ┌─────────────────────────────┬───────────────┬────────────────────────────────────────────────────┐ + │ name ┆ continent ┆ geom │ + │ utf8 ┆ utf8 ┆ geometry │ + ╞═════════════════════════════╪═══════════════╪════════════════════════════════════════════════════╡ + │ Fiji ┆ Oceania ┆ MULTIPOLYGON(((180 -16.067132663642447,180 -16.55… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ United Republic of Tanzania ┆ Africa ┆ POLYGON((33.90371119710453 -0.9500000000000001,34… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Western Sahara ┆ Africa ┆ POLYGON((-8.665589565454809 27.656425889592356,-8… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Canada ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ United States of America ┆ North America ┆ MULTIPOLYGON(((-122.84000000000003 49.00000000000… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Kazakhstan ┆ Asia ┆ POLYGON((87.35997033076265 49.21498078062912,86.5… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Uzbekistan ┆ Asia ┆ POLYGON((55.96819135928291 41.30864166926936,55.9… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Papua New Guinea ┆ Oceania ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Indonesia ┆ Asia ┆ MULTIPOLYGON(((141.00021040259185 -2.600151055515… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Argentina ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887… │ + └─────────────────────────────┴───────────────┴────────────────────────────────────────────────────┘ + + +Confirm the schema of the DataFrame. + + +```python +res.schema +``` + + + + + SedonaSchema with 3 fields: + name: utf8 + continent: utf8 + geom: geometry + + + +## Filter countries in a particular geographic region + +Now, let's grab all the countries in the western half of South America using a polygon region. + +SedonaDB can run these types of queries on geometric data. + + +```python +res = sd.sql(""" +SELECT + name, + continent, + ST_GeomFromWKT(geometry_wkt) as geom +FROM my_table +WHERE ST_Intersects( + ST_GeomFromWKT(geometry_wkt), + ST_GeomFromWKT('POLYGON((-81 12, -58 12, -58 -55, -81 -55, -81 12))') +) +""") +res.show() +``` + + ┌──────────────────┬───────────────┬───────────────────────────────────────────────────────────────┐ + │ name ┆ continent ┆ geom │ + │ utf8 ┆ utf8 ┆ geometry │ + ╞══════════════════╪═══════════════╪═══════════════════════════════════════════════════════════════╡ + │ Argentina ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887449,-68.25 … │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Chile ┆ South America ┆ MULTIPOLYGON(((-68.63401022758323 -52.63637045887449,-68.633… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Falkland Islands ┆ South America ┆ POLYGON((-61.2 -51.85,-60 -51.25,-59.15 -51.5,-58.5500000000… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Uruguay ┆ South America ┆ POLYGON((-57.62513342958296 -30.21629485445426,-56.976025763… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Brazil ┆ South America ┆ POLYGON((-53.373661668498244 -33.768377780900764,-53.6505439… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Bolivia ┆ South America ┆ POLYGON((-69.52967810736496 -10.951734307502194,-68.78615759… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Peru ┆ South America ┆ POLYGON((-69.89363521999663 -4.2981869441943275,-70.79476884… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Colombia ┆ South America ┆ POLYGON((-66.87632585312258 1.253360500489336,-67.0650481838… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Panama ┆ North America ┆ POLYGON((-77.35336076527386 8.67050466555807,-77.47472286651… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Venezuela ┆ South America ┆ POLYGON((-60.73357418480372 5.200277207861901,-60.6011791652… │ + └──────────────────┴───────────────┴───────────────────────────────────────────────────────────────┘ diff --git a/mkdocs.yml b/mkdocs.yml index 9f455255c..aaaf78f9b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,16 +41,18 @@ nav: - Home: index.md - Quickstart: quickstart-python.md - Programming Guide: programming-guide.md - - GeoPandas interop: geopandas-interop.md - - Overture Examples: overture-examples.md - - CRS Examples: crs-examples.md - - Working with Parquet Files: working-with-parquet-files.md - - Working with SQL in SedonaDB: working-with-sql-sedonadb.md - - Contributors Guide: contributors-guide.md - API: - Python: reference/python.md - SQL: reference/sql.md - Spatial Joins: reference/sql-joins.md + - Examples: + - GeoPandas interop: geopandas-interop.md + - Overture Examples: overture-examples.md + - CRS Examples: crs-examples.md + - Delta Lake: delta-lake.md + - Working with Parquet Files: working-with-parquet-files.md + - Working with SQL in SedonaDB: working-with-sql-sedonadb.md + - Contributors Guide: contributors-guide.md - SpatialBench: https://sedona.apache.org/spatialbench/ - Blog: https://sedona.apache.org/latest/blog/ - Community: https://sedona.apache.org/latest/setup/compile/