diff --git a/src/ca_biositing/datamodels/utils/database_joins.ipynb b/src/ca_biositing/datamodels/utils/database_joins.ipynb new file mode 100644 index 00000000..dbb836f2 --- /dev/null +++ b/src/ca_biositing/datamodels/utils/database_joins.ipynb @@ -0,0 +1,570 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Database Joins and Denormalized Views\n", + "\n", + "This notebook demonstrates how to use SQLAlchemy to perform complex joins on the project's data models and create denormalized views suitable for API endpoints.\n", + "\n", + "It replaces the older `database_interaction.ipynb` and uses the new generated data models.\n", + "\n", + "**Note:** Ensure you are running this notebook in the project's Pixi environment where `ca-biositing-datamodels` is installed." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python Executable: /Users/pjsmitty301/ca-biositing/.pixi/envs/default/bin/python\n", + "✅ You are running in a Pixi environment.\n" + ] + } + ], + "source": [ + "# Verify Kernel\n", + "import sys\n", + "print(f\"Python Executable: {sys.executable}\")\n", + "\n", + "# Check if we are in the pixi environment (path should contain .pixi)\n", + "if \".pixi\" in sys.executable:\n", + " print(\"✅ You are running in a Pixi environment.\")\n", + "else:\n", + " print(\"⚠️ You might be running in the wrong kernel. Please select the 'default' kernel created by Pixi.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Libraries and models imported successfully.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine, select\n", + "from sqlalchemy.orm import Session\n", + "\n", + "# Import models from ca_biositing.py as the source of truth\n", + "from ca_biositing.datamodels.schemas.generated.ca_biositing import *\n", + "\n", + "print(\"Libraries and models imported successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected to database.\n" + ] + } + ], + "source": [ + "# Database Connection\n", + "DATABASE_URL = \"postgresql+psycopg2://biocirv_user:biocirv_dev_password@localhost:5432/biocirv_db\"\n", + "engine = create_engine(DATABASE_URL)\n", + "print(f\"Connected to database.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Join LandIQ Record and Observation\n", + "\n", + "We will join `LandiqRecord` and `Observation` tables on `observation.record_id = landiq_record.id`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": " is not a generic class", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[34]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m stmt2 = select(LandiqRecord, PrimaryCrop).join(\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mPrimaryCrop\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mname\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m, \n\u001b[32m 3\u001b[39m LandiqRecord.main_crop == PrimaryCrop.id).limit(\u001b[32m10\u001b[39m)\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(stmt2)\n\u001b[32m 7\u001b[39m df2 = pd.read_sql(stmt2, engine)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ca-biositing/.pixi/envs/default/lib/python3.12/typing.py:398\u001b[39m, in \u001b[36m_tp_cache..decorator..inner\u001b[39m\u001b[34m(*args, **kwds)\u001b[39m\n\u001b[32m 396\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[32m 397\u001b[39m \u001b[38;5;28;01mpass\u001b[39;00m \u001b[38;5;66;03m# All real errors (not unhashable args) are raised below.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m398\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ca-biositing/.pixi/envs/default/lib/python3.12/typing.py:1110\u001b[39m, in \u001b[36m_generic_class_getitem\u001b[39m\u001b[34m(cls, params)\u001b[39m\n\u001b[32m 1108\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m prepare \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1109\u001b[39m params = prepare(\u001b[38;5;28mcls\u001b[39m, params)\n\u001b[32m-> \u001b[39m\u001b[32m1110\u001b[39m \u001b[43m_check_generic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__parameters__\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1112\u001b[39m new_args = []\n\u001b[32m 1113\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m param, new_arg \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;28mcls\u001b[39m.__parameters__, params):\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ca-biositing/.pixi/envs/default/lib/python3.12/site-packages/typing_extensions.py:3111\u001b[39m, in \u001b[36m_check_generic\u001b[39m\u001b[34m(cls, parameters, elen)\u001b[39m\n\u001b[32m 3106\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Check correct count for parameters of a generic cls (internal helper).\u001b[39;00m\n\u001b[32m 3107\u001b[39m \n\u001b[32m 3108\u001b[39m \u001b[33;03mThis gives a nice error message in case of count mismatch.\u001b[39;00m\n\u001b[32m 3109\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 3110\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m elen:\n\u001b[32m-> \u001b[39m\u001b[32m3111\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m is not a generic class\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 3112\u001b[39m alen = \u001b[38;5;28mlen\u001b[39m(parameters)\n\u001b[32m 3113\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m alen != elen:\n", + "\u001b[31mTypeError\u001b[39m: is not a generic class" + ] + } + ], + "source": [ + "stmt2 = select(LandiqRecord, PrimaryCrop).join(\n", + " PrimaryCrop, \n", + " LandiqRecord.main_crop == PrimaryCrop.id).limit(10)\n", + "\n", + "print(stmt2)\n", + "\n", + "df2 = pd.read_sql(stmt2, engine)\n", + "\n", + "df2.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SQL Query:\n", + "SELECT landiq_record.dataset_id, landiq_record.polygon_id, landiq_record.main_crop, landiq_record.secondary_crop, landiq_record.tertiary_crop, landiq_record.quaternary_crop, landiq_record.confidence, landiq_record.irrigated, landiq_record.acres, landiq_record.version, landiq_record.note, landiq_record.test, landiq_record.id, landiq_record.created_at, landiq_record.updated_at, landiq_record.etl_run_id, landiq_record.lineage_group_id, observation.dataset_id AS dataset_id_1, observation.record_type, observation.record_id, observation.parameter_id, observation.value, observation.unit_id, observation.dimension_type_id, observation.dimension_value, observation.dimension_unit_id, observation.note AS note_1, observation.id AS id_1, observation.created_at AS created_at_1, observation.updated_at AS updated_at_1, observation.etl_run_id AS etl_run_id_1, observation.lineage_group_id AS lineage_group_id_1 \n", + "FROM landiq_record JOIN observation ON observation.record_id = landiq_record.id\n" + ] + } + ], + "source": [ + "# Create the join query\n", + "stmt = select(LandiqRecord, Observation).join(\n", + " Observation,\n", + " Observation.record_id == LandiqRecord.id\n", + ")\n", + "\n", + "print(\"SQL Query:\")\n", + "print(stmt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 0 rows.\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + }, + { + "name": "polygon_id", + "rawType": "object", + "type": "string" + }, + { + "name": "main_crop", + "rawType": "object", + "type": "string" + }, + { + "name": "secondary_crop", + "rawType": "object", + "type": "string" + }, + { + "name": "tertiary_crop", + "rawType": "object", + "type": "string" + }, + { + "name": "quaternary_crop", + "rawType": "object", + "type": "string" + }, + { + "name": "confidence", + "rawType": "object", + "type": "string" + }, + { + "name": "irrigated", + "rawType": "object", + "type": "string" + }, + { + "name": "acres", + "rawType": "object", + "type": "string" + }, + { + "name": "version", + "rawType": "object", + "type": "string" + }, + { + "name": "note", + "rawType": "object", + "type": "string" + }, + { + "name": "test", + "rawType": "object", + "type": "string" + }, + { + "name": "id", + "rawType": "object", + "type": "string" + }, + { + "name": "created_at", + "rawType": "object", + "type": "string" + }, + { + "name": "updated_at", + "rawType": "object", + "type": "string" + }, + { + "name": "etl_run_id", + "rawType": "object", + "type": "string" + }, + { + "name": "lineage_group_id", + "rawType": "object", + "type": "string" + }, + { + "name": "dataset_id_1", + "rawType": "object", + "type": "string" + }, + { + "name": "record_type", + "rawType": "object", + "type": "string" + }, + { + "name": "record_id", + "rawType": "object", + "type": "string" + }, + { + "name": "parameter_id", + "rawType": "object", + "type": "string" + }, + { + "name": "value", + "rawType": "object", + "type": "string" + }, + { + "name": "unit_id", + "rawType": "object", + "type": "string" + }, + { + "name": "dimension_type_id", + "rawType": "object", + "type": "string" + }, + { + "name": "dimension_value", + "rawType": "object", + "type": "string" + }, + { + "name": "dimension_unit_id", + "rawType": "object", + "type": "string" + }, + { + "name": "note_1", + "rawType": "object", + "type": "string" + }, + { + "name": "id_1", + "rawType": "object", + "type": "string" + }, + { + "name": "created_at_1", + "rawType": "object", + "type": "string" + }, + { + "name": "updated_at_1", + "rawType": "object", + "type": "string" + }, + { + "name": "etl_run_id_1", + "rawType": "object", + "type": "string" + }, + { + "name": "lineage_group_id_1", + "rawType": "object", + "type": "string" + } + ], + "ref": "edc059e1-7169-4eb9-ad76-c55b46d93053", + "rows": [], + "shape": { + "columns": 32, + "rows": 0 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dataset_idpolygon_idmain_cropsecondary_croptertiary_cropquaternary_cropconfidenceirrigatedacresversion...unit_iddimension_type_iddimension_valuedimension_unit_idnote_1id_1created_at_1updated_at_1etl_run_id_1lineage_group_id_1
\n", + "

0 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [dataset_id, polygon_id, main_crop, secondary_crop, tertiary_crop, quaternary_crop, confidence, irrigated, acres, version, note, test, id, created_at, updated_at, etl_run_id, lineage_group_id, dataset_id_1, record_type, record_id, parameter_id, value, unit_id, dimension_type_id, dimension_value, dimension_unit_id, note_1, id_1, created_at_1, updated_at_1, etl_run_id_1, lineage_group_id_1]\n", + "Index: []\n", + "\n", + "[0 rows x 32 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Execute and load into DataFrame\n", + "try:\n", + " with Session(engine) as session:\n", + " # Using pandas read_sql to execute the statement and fetch results\n", + " # Note: Columns with same names (like 'id') might be duplicated or suffixed automatically by pandas.\n", + " df = pd.read_sql(stmt, session.bind)\n", + " \n", + " print(f\"Loaded {len(df)} rows.\")\n", + " display(df.head())\n", + "except Exception as e:\n", + " print(f\"Error executing query: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['dataset_id', 'polygon_id', 'main_crop', 'secondary_crop',\n", + " 'tertiary_crop', 'quaternary_crop', 'confidence', 'irrigated', 'acres',\n", + " 'version', 'note', 'test', 'id', 'created_at', 'updated_at',\n", + " 'etl_run_id', 'lineage_group_id', 'dataset_id_1', 'record_type',\n", + " 'record_id', 'parameter_id', 'value', 'unit_id', 'dimension_type_id',\n", + " 'dimension_value', 'dimension_unit_id', 'note_1', 'id_1',\n", + " 'created_at_1', 'updated_at_1', 'etl_run_id_1', 'lineage_group_id_1'],\n", + " dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "id", + "rawType": "object", + "type": "string" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + }, + { + "name": "record_id", + "rawType": "object", + "type": "string" + }, + { + "name": "main_crop", + "rawType": "object", + "type": "string" + }, + { + "name": "parameter_id", + "rawType": "object", + "type": "string" + }, + { + "name": "value", + "rawType": "object", + "type": "string" + }, + { + "name": "unit_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "5c95c323-56ec-4546-ad20-84e7b50c7748", + "rows": [], + "shape": { + "columns": 7, + "rows": 0 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddataset_idrecord_idmain_cropparameter_idvalueunit_id
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, dataset_id, record_id, main_crop, parameter_id, value, unit_id]\n", + "Index: []" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[[\"id\", \"dataset_id\", \"record_id\", \"main_crop\", \"parameter_id\", \"value\", \"unit_id\"]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/ca_biositing/datamodels/utils/query_landiq_observation.py b/src/ca_biositing/datamodels/utils/query_landiq_observation.py new file mode 100644 index 00000000..6a7a0ee4 --- /dev/null +++ b/src/ca_biositing/datamodels/utils/query_landiq_observation.py @@ -0,0 +1,22 @@ +from sqlalchemy import select +from ca_biositing.datamodels.schemas.generated.ca_biositing import LandiqRecord, Observation + +def generate_query(): + """ + Generates a SQL query joining LandiqRecord and Observation tables. + """ + # Create the select statement + # We select from LandiqRecord and join Observation + # The join condition is explicitly specified as requested: + # observation.record_id = landiq_record.id + stmt = select(LandiqRecord, Observation).join( + Observation, + Observation.record_id == LandiqRecord.id + ) + + return stmt + +if __name__ == "__main__": + query = generate_query() + print("Generated SQL Query:") + print(query)