diff --git a/examples/nyc-taxi/Notebook.ipynb b/examples/nyc-taxi/Notebook.ipynb new file mode 100644 index 000000000..f9244fce9 --- /dev/null +++ b/examples/nyc-taxi/Notebook.ipynb @@ -0,0 +1,1654 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "235d821b-1ff8-4ef6-8f0b-559c95254479", + "metadata": {}, + "source": [ + "# NYC Taxi Trip-Duration Estimation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4a2645a7-a748-43ac-a07f-cbb4fc4d3f4e", + "metadata": {}, + "source": [ + "NOTE: This step is only needed if you haven't already" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "0b21472f-27b0-4b14-ba48-f3c4e11d36bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: kaskada in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (0.5.1)\n", + "Requirement already satisfied: html5lib<2.0,>=1.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.1)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.58.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.59.1)\n", + "Requirement already satisfied: grpcio-health-checking<2.0.0,>=1.54.2 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.56.0)\n", + "Requirement already satisfied: ipython==7.34.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (7.34.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.64.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (4.65.0)\n", + "Requirement already satisfied: grpcio<2.0.0,>=1.51.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.56.0)\n", + "Requirement already satisfied: domonic<0.10.0,>=0.9.11 in 
/Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (0.9.11)\n", + "Requirement already satisfied: certifi<2023.0.0,>=2022.12.7 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (2022.12.7)\n", + "Requirement already satisfied: pandas<1.4,>=1.3 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.3.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.28.2 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (2.28.2)\n", + "Requirement already satisfied: grpcio-status<2.0.0,>=1.51.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.56.0)\n", + "Requirement already satisfied: pygithub<2.0,>=1.57 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (1.59.0)\n", + "Requirement already satisfied: pyarrow<11.0.0,>=10.0.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from kaskada) (10.0.1)\n", + "Requirement already satisfied: traitlets>=4.2 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (5.9.0)\n", + "Requirement already satisfied: decorator in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (5.1.1)\n", + "Requirement already satisfied: backcall in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (0.2.0)\n", + "Requirement already satisfied: jedi>=0.16 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (0.18.2)\n", + "Requirement already satisfied: pickleshare in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (0.7.5)\n", + "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in 
/Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (3.0.39)\n", + "Requirement already satisfied: matplotlib-inline in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (0.1.6)\n", + "Requirement already satisfied: appnope in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (0.1.3)\n", + "Requirement already satisfied: pexpect>4.3 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (4.8.0)\n", + "Requirement already satisfied: setuptools>=18.5 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (58.1.0)\n", + "Requirement already satisfied: pygments in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from ipython==7.34.0->kaskada) (2.15.1)\n", + "Requirement already satisfied: urllib3~=1.26.9 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from domonic<0.10.0,>=0.9.11->kaskada) (1.26.16)\n", + "Requirement already satisfied: python-dateutil==2.8.2 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from domonic<0.10.0,>=0.9.11->kaskada) (2.8.2)\n", + "Requirement already satisfied: elementpath~=2.5.2 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from domonic<0.10.0,>=0.9.11->kaskada) (2.5.3)\n", + "Requirement already satisfied: cssselect~=1.1.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from domonic<0.10.0,>=0.9.11->kaskada) (1.1.0)\n", + "Requirement already satisfied: six>=1.5 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from python-dateutil==2.8.2->domonic<0.10.0,>=0.9.11->kaskada) (1.16.0)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5 in 
/Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from googleapis-common-protos<2.0.0,>=1.58.0->kaskada) (4.23.4)\n", + "Requirement already satisfied: webencodings in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from html5lib<2.0,>=1.1->kaskada) (0.5.1)\n", + "Requirement already satisfied: pytz>=2017.3 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pandas<1.4,>=1.3->kaskada) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pandas<1.4,>=1.3->kaskada) (1.25.1)\n", + "Requirement already satisfied: pyjwt[crypto]>=2.4.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pygithub<2.0,>=1.57->kaskada) (2.7.0)\n", + "Requirement already satisfied: pynacl>=1.4.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pygithub<2.0,>=1.57->kaskada) (1.5.0)\n", + "Requirement already satisfied: deprecated in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pygithub<2.0,>=1.57->kaskada) (1.2.14)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from requests<3.0.0,>=2.28.2->kaskada) (3.4)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from requests<3.0.0,>=2.28.2->kaskada) (3.2.0)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from jedi>=0.16->ipython==7.34.0->kaskada) (0.8.3)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pexpect>4.3->ipython==7.34.0->kaskada) (0.7.0)\n", + "Requirement already satisfied: wcwidth in 
/Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython==7.34.0->kaskada) (0.2.6)\n", + "Requirement already satisfied: cryptography>=3.4.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pyjwt[crypto]>=2.4.0->pygithub<2.0,>=1.57->kaskada) (41.0.2)\n", + "Requirement already satisfied: cffi>=1.4.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pynacl>=1.4.0->pygithub<2.0,>=1.57->kaskada) (1.15.1)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from deprecated->pygithub<2.0,>=1.57->kaskada) (1.15.0)\n", + "Requirement already satisfied: pycparser in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from cffi>=1.4.1->pynacl>=1.4.0->pygithub<2.0,>=1.57->kaskada) (2.21)\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n", + "You should consider upgrading via the '/Users/ryan.michael/.pyenv/versions/3.10.5/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install kaskada" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d1fbe48d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: matplotlib in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (3.7.2)\n", + "Requirement already satisfied: seaborn in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (0.12.2)\n", + "Requirement already satisfied: xgboost in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (1.7.6)\n", + "Requirement already satisfied: scikit-learn in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (1.3.0)\n", + "Requirement already satisfied: numpy>=1.20 
in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (1.25.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (1.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (4.41.0)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: pillow>=6.2.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (10.0.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from matplotlib) (23.1)\n", + "Requirement already satisfied: pandas>=0.25 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from seaborn) (1.3.5)\n", + "Requirement already satisfied: scipy in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from xgboost) (1.11.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from scikit-learn) (3.2.0)\n", + "Requirement already satisfied: joblib>=1.1.1 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from scikit-learn) (1.3.1)\n", + "Requirement already 
satisfied: pytz>=2017.3 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from pandas>=0.25->seaborn) (2023.3)\n", + "Requirement already satisfied: six>=1.5 in /Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\n", + "You should consider upgrading via the '/Users/ryan.michael/.pyenv/versions/3.10.5/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install matplotlib seaborn xgboost scikit-learn" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9a12d5ec-4059-433e-b160-13f8d0626df6", + "metadata": {}, + "source": [ + "1. Create a local, in-memory Kaskada instance" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0b8b442f-c6a3-4b96-91c4-afbc9d718843", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:kaskada.api.release:Using latest release version: engine@v0.10.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ryan.michael/.pyenv/versions/3.10.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "kaskada-engine-darwin-arm64: 100%|██████████| 50.4M/50.4M [00:10<00:00, 5.21MB/s]\n", + "kaskada-manager-darwin-arm64: 100%|██████████| 39.3M/39.3M [00:08<00:00, 4.98MB/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:kaskada.api.local_session.local_service:Initializing manager process\n", + "INFO:kaskada.api.local_session.local_service:Logging manager STDOUT to /Users/ryan.michael/.cache/kaskada/logs/2023-07-26T15-45-45-manager-stdout.log\n", + "INFO:kaskada.api.local_session.local_service:Logging manager STDERR to /Users/ryan.michael/.cache/kaskada/logs/2023-07-26T15-45-45-manager-stdout.log\n", + "INFO:kaskada.api.local_session.local_service:Initializing engine process\n", + "INFO:kaskada.api.local_session.local_service:Logging engine STDOUT to /Users/ryan.michael/.cache/kaskada/logs/2023-07-26T15-45-45-engine-stdout.log\n", + "INFO:kaskada.api.local_session.local_service:Logging engine STDERR to /Users/ryan.michael/.cache/kaskada/logs/2023-07-26T15-45-45-engine-stdout.log\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:kaskada.client:Successfully connected.\n" + ] + } + ], + "source": [ + "from kaskada.api.session import LocalBuilder\n", + "from kaskada import table\n", + "\n", + "session = LocalBuilder().build()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "06d88408", + "metadata": {}, + "source": [ + "## Data Prep\n", + "\n", + "The included dataset is derived from the [NYC Taxi Trip Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) provided by the NYC Taxi & Limousine Commission (TLC).\n", + "The raw data has been cleaned using the following DuckDB queries.\n", + "\n", + "First, the trip records are joined with the \"zones\" data to provide the trip distance.\n", + "\n", + 
"```sql\n", + "INSTALL spatial;\n", + "INSTALL parquet;\n", + "LOAD spatial;\n", + "LOAD parquet;\n", + "\n", + "CREATE TABLE zones AS SELECT zone, LocationId, borough, ST_GeomFromWKB(wkb_geometry) AS geom \n", + "FROM ST_Read('./taxi_zones/taxi_zones.shx');\n", + "\n", + "copy (\n", + " select \n", + " hvfhs_license_num,\n", + " dispatching_base_num,\n", + " originating_base_num,\n", + " request_datetime,\n", + " on_scene_datetime,\n", + " pickup_datetime,\n", + " dropoff_datetime,\n", + " PULocationID AS pu_location_id,\n", + " DOLocationID AS do_location_id,\n", + " trip_miles,\n", + " trip_time,\n", + " base_passenger_fare,\n", + " tolls,\n", + " bcf,\n", + " sales_tax,\n", + " congestion_surcharge,\n", + " airport_fee,\n", + " tips,\n", + " driver_pay,\n", + " shared_request_flag,\n", + " shared_match_flag,\n", + " access_a_ride_flag,\n", + " wav_request_flag,\n", + " wav_match_flag,\n", + " PUZone.zone AS pu_zone,\n", + " PUZone.borough AS pu_borough,\n", + " DOZone.zone AS do_zone,\n", + " DOZone.borough AS do_borough,\n", + " ST_Distance( ST_Centroid(PUZone.geom), ST_Centroid(DOZone.geom)) / 5280 AS distance_miles,\n", + "\n", + " from 'fhvhv_tripdata_2023-02.parquet' \n", + " join zones as PUZone on PULocationID = PUZone.LocationID \n", + " join zones as DOZone on DOLocationID = DOZone.LocationID\n", + ") TO 'fhvhv_combined.parquet' (FORMAT PARQUET);\n", + "```\n", + "\n", + "\n", + "\n", + "Since the dataset contains timestamps for both pickup and dropoff time, the dataset has been split into separate files, one describing pickup events and another containing dropoff events. 
The fields present in the pickup dataset are filtered to information that could plausibly be known at pickup time, specifically the following fields are omitted:\n", + "\n", + "* dropoff_datetime, The date and time of the trip drop-off\n", + "* trip_time, total time in seconds for passenger trip\n", + "* base_passenger_fare, base passenger fare before tolls, tips, taxes, and fees\n", + "* tolls, total amount of all tolls paid in trip\n", + "* bcf, total amount collected in trip for Black Car Fund\n", + "* sales_tax, total amount collected in trip for NYS sales tax\n", + "* congestion_surcharge, total amount collected in trip for NYS congestion surcharge\n", + "* airport_fee, $2.50 for both drop off and pick up at LaGuardia, Newark, and John F. Kennedy airports\n", + "* tips, total amount of tips received from passenger \n", + "* driver_pay, total driver pay (not including tolls or tips and net of commission, surcharges, or taxes)\n", + "\n", + "Specifically, the following query is used:\n", + "\n", + "```sql\n", + "copy (\n", + " select \n", + " hvfhs_license_num,\n", + " dispatching_base_num,\n", + " originating_base_num,\n", + " request_datetime,\n", + " on_scene_datetime,\n", + " pickup_datetime,\n", + " PULocationID AS pu_location_id,\n", + " DOLocationID AS do_location_id,\n", + " trip_miles,\n", + " shared_request_flag,\n", + " shared_match_flag,\n", + " access_a_ride_flag,\n", + " wav_request_flag,\n", + " wav_match_flag,\n", + " PUZone.zone AS pu_zone,\n", + " PUZone.borough AS pu_borough,\n", + " DOZone.zone AS do_zone,\n", + " DOZone.borough AS do_borough,\n", + " ST_Distance( ST_Centroid(PUZone.geom), ST_Centroid(DOZone.geom)) / 5280 AS distance_miles,\n", + "\n", + " from 'fhvhv_tripdata_2023-02.parquet' \n", + " join zones as PUZone on PULocationID = PUZone.LocationID \n", + " join zones as DOZone on DOLocationID = DOZone.LocationID\n", + ") TO 'fhvhv_pickups.parquet' (FORMAT PARQUET);\n", + "```" + ] + }, + { + "cell_type": "code", + 
"execution_count": 2, + "id": "0ec9a6f9-22da-459e-b456-06299d2a4116", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:fenlmagic:extension loaded\n" + ] + } + ], + "source": [ + "%load_ext fenlmagic" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "aafdfe0b-deab-4a88-8ce6-0c4d8c7e96c6", + "metadata": { + "tags": [] + }, + "source": [ + "2. Create a table for the data" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "3501e80b-5359-4425-8432-510ff7005dee", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
table
table_name
Pickup
entity_key_column_name
pu_zone
time_column_name
request_datetime
grouping_id
License
version
0
create_time
2023-07-26T17:24:57.301108
update_time
2023-07-26T17:24:57.301109
request_details
request_id
fd87a55b5b3b1ded4309643547665754
table {\n",
+       "  table_id: \"2ed03cae-65eb-4d7b-a58f-30634cbe1455\"\n",
+       "  table_name: \"Pickup\"\n",
+       "  time_column_name: \"request_datetime\"\n",
+       "  entity_key_column_name: \"pu_zone\"\n",
+       "  grouping_id: \"License\"\n",
+       "  create_time {\n",
+       "    seconds: 1690406697\n",
+       "    nanos: 301108000\n",
+       "  }\n",
+       "  update_time {\n",
+       "    seconds: 1690406697\n",
+       "    nanos: 301109000\n",
+       "  }\n",
+       "  source {\n",
+       "    kaskada {\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "request_details {\n",
+       "  request_id: \"fd87a55b5b3b1ded4309643547665754\"\n",
+       "}\n",
+       "
" + ], + "text/plain": [ + "table {\n", + " table_id: \"2ed03cae-65eb-4d7b-a58f-30634cbe1455\"\n", + " table_name: \"Pickup\"\n", + " time_column_name: \"request_datetime\"\n", + " entity_key_column_name: \"pu_zone\"\n", + " grouping_id: \"License\"\n", + " create_time {\n", + " seconds: 1690406697\n", + " nanos: 301108000\n", + " }\n", + " update_time {\n", + " seconds: 1690406697\n", + " nanos: 301109000\n", + " }\n", + " source {\n", + " kaskada {\n", + " }\n", + " }\n", + "}\n", + "request_details {\n", + " request_id: \"fd87a55b5b3b1ded4309643547665754\"\n", + "}" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.delete_table(\"Pickup\")\n", + "table.create_table(\n", + " # The table's name\n", + " table_name = \"Pickup\",\n", + " # The name of the column in the data that contains the time associated with each row\n", + " time_column_name = \"request_datetime\",\n", + " # The name of the column in the data that contains the entity key associated with each row\n", + " entity_key_column_name = \"pu_zone\",\n", + " grouping_id = \"License\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "64442f6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
table
table_name
Dropoff
entity_key_column_name
pu_zone
time_column_name
request_datetime
grouping_id
License
version
0
create_time
2023-07-26T17:24:59.016196
update_time
2023-07-26T17:24:59.016196
request_details
request_id
b2dee8043c3d0dc5ecc3d21e6c095e21
table {\n",
+       "  table_id: \"750c5348-7fd2-4d6f-b541-2975d517285d\"\n",
+       "  table_name: \"Dropoff\"\n",
+       "  time_column_name: \"request_datetime\"\n",
+       "  entity_key_column_name: \"pu_zone\"\n",
+       "  grouping_id: \"License\"\n",
+       "  create_time {\n",
+       "    seconds: 1690406699\n",
+       "    nanos: 16196000\n",
+       "  }\n",
+       "  update_time {\n",
+       "    seconds: 1690406699\n",
+       "    nanos: 16196000\n",
+       "  }\n",
+       "  source {\n",
+       "    kaskada {\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "request_details {\n",
+       "  request_id: \"b2dee8043c3d0dc5ecc3d21e6c095e21\"\n",
+       "}\n",
+       "
" + ], + "text/plain": [ + "table {\n", + " table_id: \"750c5348-7fd2-4d6f-b541-2975d517285d\"\n", + " table_name: \"Dropoff\"\n", + " time_column_name: \"request_datetime\"\n", + " entity_key_column_name: \"pu_zone\"\n", + " grouping_id: \"License\"\n", + " create_time {\n", + " seconds: 1690406699\n", + " nanos: 16196000\n", + " }\n", + " update_time {\n", + " seconds: 1690406699\n", + " nanos: 16196000\n", + " }\n", + " source {\n", + " kaskada {\n", + " }\n", + " }\n", + "}\n", + "request_details {\n", + " request_id: \"b2dee8043c3d0dc5ecc3d21e6c095e21\"\n", + "}" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.delete_table(\"Dropoff\")\n", + "table.create_table(\n", + " # The table's name\n", + " table_name = \"Dropoff\",\n", + " # The name of the column in the data that contains the time associated with each row\n", + " time_column_name = \"request_datetime\",\n", + " # The name of the column in the data that contains the entity key associated with each row\n", + " entity_key_column_name = \"pu_zone\",\n", + " grouping_id = \"License\",\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "06deab69-e940-43ea-893b-4ae95cf931c9", + "metadata": { + "tags": [] + }, + "source": [ + "3. Load the files' contents into the Pickup and Dropoff tables" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "cb11430f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
data_token_id
c7a71379-778e-4824-9c66-9cbcc384e439
request_details
request_id
bdbc4da06e32488f34d49d748f6cdc85
data_token_id: \"c7a71379-778e-4824-9c66-9cbcc384e439\"\n",
+       "request_details {\n",
+       "  request_id: \"bdbc4da06e32488f34d49d748f6cdc85\"\n",
+       "}\n",
+       "
" + ], + "text/plain": [ + "data_token_id: \"c7a71379-778e-4824-9c66-9cbcc384e439\"\n", + "request_details {\n", + " request_id: \"bdbc4da06e32488f34d49d748f6cdc85\"\n", + "}" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.load(table_name = \"Pickup\", file = \"fhvhv_pickups.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "e07a4c81-717a-4606-80a5-80d790cc2dda", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
data_token_id
1d583098-7474-471c-a9bc-ddc69c44bd42
request_details
request_id
aa42277769f00093be96cbb0a247aac2
data_token_id: \"1d583098-7474-471c-a9bc-ddc69c44bd42\"\n",
+       "request_details {\n",
+       "  request_id: \"aa42277769f00093be96cbb0a247aac2\"\n",
+       "}\n",
+       "
" + ], + "text/plain": [ + "data_token_id: \"1d583098-7474-471c-a9bc-ddc69c44bd42\"\n", + "request_details {\n", + " request_id: \"aa42277769f00093be96cbb0a247aac2\"\n", + "}" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.load(table_name = \"Dropoff\", file = \"fhvhv_combined.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:kaskada.client:Slicing set to: {'percent': {'percent': 2}}\n" + ] + } + ], + "source": [ + "# Downsample to a subset of licenses\n", + "from kaskada.slice_filters import EntityPercentFilter\n", + "import kaskada.client\n", + "filter_percentage = 2\n", + "entity_filter = EntityPercentFilter(filter_percentage)\n", + "kaskada.client.set_default_slice(entity_filter)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ec8c2c82-df9f-4de4-8198-9995ca495b90", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
_time_subsort_key_hash_keyhvfhs_license_numdispatching_base_numoriginating_base_numrequest_datetimeon_scene_datetimepickup_datetimepu_location_iddo_location_idtrip_milesshared_request_flagshared_match_flagaccess_a_ride_flagwav_request_flagwav_match_flagpu_zonepu_boroughdo_zonedo_boroughdistance_miles
02023-01-31 23:36:2213355119442866649656746177895434293976East Flatbush/FarragutHV0005B03406None2023-01-31 23:36:22NaT2023-02-01 00:00:44712588.335YYNNNEast Flatbush/FarragutBrooklynWoodhavenQueens5.296522
12023-01-31 23:52:5713355119442866650606634524494153410281South Beach/Dongan HillsHV0005B03406None2023-01-31 23:52:57NaT2023-02-01 00:03:162141152.307NNNNNSouth Beach/Dongan HillsStaten IslandGrymes Hill/CliftonStaten Island2.155251
22023-01-31 23:54:4213355119442866649604507444716760075251Midtown NorthHV0003B03404B034042023-01-31 23:54:422023-01-31 23:59:042023-02-01 00:01:26163424.630NNNNMidtown NorthManhattanCentral Harlem NorthManhattan4.186137
32023-01-31 23:55:0013355119442866649673582766839127606418Co-Op CityHV0005B03406None2023-01-31 23:55:00NaT2023-02-01 00:17:48511268.109YNNNNCo-Op CityBronxHunts PointBronx5.220225
42023-01-31 23:55:1713355119442866649553507444716760075251Midtown NorthHV0005B03406None2023-01-31 23:55:17NaT2023-02-01 00:29:151631663.467YYNNYMidtown NorthManhattanMorningside HeightsManhattan3.216216
........................................................................
4857222023-02-28 23:56:3313355119442867135034813114853382412341Clinton HillHV0005B03406None2023-02-28 23:56:33NaT2023-02-28 23:58:26491811.410NNNNNClinton HillBrooklynPark SlopeBrooklyn1.573296
4857232023-02-28 23:56:4013355119442867135009813114853382412341Clinton HillHV0003B03404B034042023-02-28 23:56:402023-02-28 23:58:322023-02-28 23:59:24491882.100NNNNClinton HillBrooklynProspect-Lefferts GardensBrooklyn2.163424
4857242023-02-28 23:57:0013355119442867134907507444716760075251Midtown NorthHV0005B03406None2023-02-28 23:57:00NaT2023-02-28 23:59:23163481.254NNNNNMidtown NorthManhattanClinton EastManhattan0.661241
4857252023-02-28 23:57:2913355119442867135094813114853382412341Clinton HillHV0005B03406None2023-02-28 23:57:29NaT2023-02-28 23:59:1449170.880NNNNNClinton HillBrooklynBedfordBrooklyn0.698466
4857262023-02-28 23:58:0713355119442867134762507444716760075251Midtown NorthHV0005B03406None2023-02-28 23:58:07NaT2023-02-28 23:59:2216374.451NNNNNMidtown NorthManhattanAstoriaQueens3.043131

485727 rows × 23 columns

stateSUCCESS
query_idee9f132e-d434-49e7-b37e-d1cf9ab84d2b
metrics
time_preparing24.745s
time_computing0.614s
output_files1
analysis
can_execute
True
schema
(see Schema tab)
request_details
request_id
fa14b83e7b9106170d5f42a36f8d31d3
expression
Pickup
column_namecolumn_type
0hvfhs_license_numstring
1dispatching_base_numstring
2originating_base_numstring
3request_datetimetimestamp_us
4on_scene_datetimetimestamp_us
5pickup_datetimetimestamp_us
6pu_location_idi32
7do_location_idi32
8trip_milesf64
9shared_request_flagstring
10shared_match_flagstring
11access_a_ride_flagstring
12wav_request_flagstring
13wav_match_flagstring
14pu_zonestring
15pu_boroughstring
16do_zonestring
17do_boroughstring
18distance_milesf64
{'dataframe': 'RangeIndex: 485727 entries, 0 to 485726'\n",
+       "              'Columns: 23 entries, _time to distance_miles'\n",
+       "              'dtypes: datetime64[ns](4), float64(2), int32(2), object(13), '\n",
+       "              'uint64(2)'\n",
+       "              'memory usage: 81.5+ MB',\n",
+       " 'expression': 'Pickup',\n",
+       " 'query_response': state: STATE_SUCCESS\n",
+       "config {\n",
+       "  data_token_id: \"c7a71379-778e-4824-9c66-9cbcc384e439\"\n",
+       "  slice_request {\n",
+       "    percent {\n",
+       "      percent: 5\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "analysis {\n",
+       "  can_execute: true\n",
+       "  schema {\n",
+       "    fields {\n",
+       "      name: \"hvfhs_license_num\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"dispatching_base_num\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"originating_base_num\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"request_datetime\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_TIMESTAMP_MICROSECOND\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"on_scene_datetime\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_TIMESTAMP_MICROSECOND\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pickup_datetime\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_TIMESTAMP_MICROSECOND\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pu_location_id\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_I32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"do_location_id\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_I32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"trip_miles\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"shared_request_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"shared_match_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"access_a_ride_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"wav_request_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"wav_match_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pu_zone\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pu_borough\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"do_zone\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"do_borough\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"distance_miles\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "fenl_diagnostics {\n",
+       "}\n",
+       "metrics {\n",
+       "  time_preparing {\n",
+       "    seconds: 24\n",
+       "    nanos: 745455583\n",
+       "  }\n",
+       "  time_computing {\n",
+       "    nanos: 614440750\n",
+       "  }\n",
+       "  output_files: 1\n",
+       "  total_input_rows: 485727\n",
+       "  processed_input_rows: 485727\n",
+       "  produced_output_rows: 485727\n",
+       "}\n",
+       "request_details {\n",
+       "  request_id: \"fa14b83e7b9106170d5f42a36f8d31d3\"\n",
+       "}\n",
+       "query_id: \"ee9f132e-d434-49e7-b37e-d1cf9ab84d2b\"\n",
+       "destination {\n",
+       "  object_store {\n",
+       "    file_type: FILE_TYPE_PARQUET\n",
+       "    output_prefix_uri: \"file:///Users/ryan.michael/.cache/kaskada/data/results/9817b83f-211b-41b8-8d7f-5026fcc14d48/XpkAhQ_kcxxieZd87nYlgzZu5VfpItEns7TbUA/\"\n",
+       "    output_paths {\n",
+       "      paths: \"/Users/ryan.michael/.cache/kaskada/data/results/9817b83f-211b-41b8-8d7f-5026fcc14d48/XpkAhQ_kcxxieZd87nYlgzZu5VfpItEns7TbUA/ed5c01cc-3fef-4976-9f57-c987af692ad2-part-0.parquet\"\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "}
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%fenl --var all\n", + "Pickup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "52fb3b92-e848-4998-a793-57b36f111d8f", + "metadata": {}, + "source": [ + "## Explore!\n", + "\n", + "Try some queries of your own in the block below. The [Reference > Queries](https://kaskada.io/docs-site/kaskada/main/developing/queries.html) section of the docs can help you get started.\n", + "\n", + "Predict\n", + "* Fare - https://www.kaggle.com/competitions/new-york-city-taxi-fare-prediction/overview\n", + "* Duration - https://www.kaggle.com/c/nyc-taxi-trip-duration\n", + "* Wait time\n", + "* Tip amount <<<\n", + "\n", + "* Time of day\n", + "* Distance\n", + "* year\n", + "* day of week\n", + "\n", + "Feature engineering based on\n", + "* https://www.kaggle.com/code/headsortails/nyc-taxi-eda-update-the-fast-the-curious\n", + "* https://www.kaggle.com/code/maheshdadhich/strength-of-visualization-python-visuals-tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "5480b59a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
_time_subsort_key_hash_keytargetmonthdaydeparture_mean_speed_10mdeparture_mean_speed_60mdeparture_mean_speed_1ddeparture_count_10mdeparture_count_60mdeparture_count_1darrival_mean_speed_10marrival_mean_speed_60marrival_mean_speed_1darrival_count_10marrival_count_60marrival_count_1dhvfhs_license_numdispatching_base_numoriginating_base_numrequest_datetimeon_scene_datetimepickup_datetimepu_location_iddo_location_idtrip_milesshared_request_flagshared_match_flagaccess_a_ride_flagwav_request_flagwav_match_flagpu_zonepu_boroughdo_zonedo_boroughdistance_miles
02023-02-01 06:46:421295433853457827468933345205288917927Marine Park/Floyd Bennett Field3899116.90319016.90319016.90319011110.57582910.57582910.575829202020HV0003B03404B034042023-02-01 00:05:102023-02-01 00:13:562023-02-01 00:13:5615419811.480NNNNMarine Park/Floyd Bennett FieldBrooklynRidgewoodQueens7.794249
12023-02-01 08:06:091295433853457827469033345205288917927Marine Park/Floyd Bennett Field550111.56363411.56363414.2334121129.84822010.00448610.87743927118254HV0003B03404B034042023-02-01 06:46:422023-02-01 06:55:152023-02-01 06:55:2115423717.140NNNNMarine Park/Floyd Bennett FieldBrooklynUpper East Side SouthManhattan12.524059
22023-02-01 09:42:511295433853457827469233345205288917927Marine Park/Floyd Bennett Field767110.44929710.44929712.97204111312.2361269.28840510.9560531690249HV0003B03404B034042023-02-01 08:06:092023-02-01 08:13:362023-02-01 08:14:121541551.380NNNNMarine Park/Floyd Bennett FieldBrooklynMarine Park/Mill BasinBrooklyn1.596420
32023-02-01 09:53:001295433853457827469133345205288917927Marine Park/Floyd Bennett Field2017119.64920819.64920814.64133311410.3561729.2522308.89244420126764HV0003B03404B034042023-02-01 09:42:512023-02-01 09:48:092023-02-01 09:48:11154723.430NNNNMarine Park/Floyd Bennett FieldBrooklynEast Flatbush/Remsen VillageBrooklyn4.186373
42023-02-01 11:45:191295433853457827469333345205288917927Marine Park/Floyd Bennett Field682112.55296216.10108514.2236581259.1724678.4735729.25285330174895HV0003B03404B034042023-02-01 09:53:002023-02-01 09:57:462023-02-01 09:57:461541416.260NNNNMarine Park/Floyd Bennett FieldBrooklynBay RidgeBrooklyn7.033146
..................................................................................................................
11052023-02-28 22:05:511295433853457827579433345205288917927Marine Park/Floyd Bennett Field11882813.76471628.37306715.46307912299.1365838.7161877.661471412744398HV0003B03404B034042023-02-28 21:00:082023-02-28 21:04:422023-02-28 21:04:441541886.370NNNNMarine Park/Floyd Bennett FieldBrooklynProspect-Lefferts GardensBrooklyn5.077651
11062023-02-28 22:34:461295433853457827579533345205288917927Marine Park/Floyd Bennett Field8502828.91259228.91259216.317063112413.14184711.6270729.685040171164311HV0003B03404B034042023-02-28 22:05:512023-02-28 22:10:382023-02-28 22:11:0215413013.010YNNNMarine Park/Floyd Bennett FieldBrooklynJamaicaQueens9.541155
11072023-02-28 22:56:251295433853457827579633345205288917927Marine Park/Floyd Bennett Field8242817.73052123.32155716.373601122511.10477710.3346818.787921271282767HV0005B03406None2023-02-28 22:34:46NaT2023-02-28 22:38:09154724.107NNNNNMarine Park/Floyd Bennett FieldBrooklynEast Flatbush/Remsen VillageBrooklyn4.186373
11082023-02-28 23:04:361295433853457827579833345205288917927Marine Park/Floyd Bennett Field487288.72485818.45599016.079419132610.77358311.1745839.45134916681983HV0005B03406None2023-02-28 22:56:25NaT2023-02-28 23:04:591542102.643NNNNNMarine Park/Floyd Bennett FieldBrooklynSheepshead BayBrooklyn1.997023
11092023-02-28 23:43:501295433853457827579733345205288917927Marine Park/Floyd Bennett Field16762811.74362317.53259016.030640242712.34364411.8522239.42339010611932HV0003B03404B034042023-02-28 23:04:362023-02-28 23:08:352023-02-28 23:09:181542102.050NNNNMarine Park/Floyd Bennett FieldBrooklynSheepshead BayBrooklyn1.997023

1110 rows × 37 columns

stateSUCCESS
query_id754fd0a4-7d1a-421a-82ac-0e1eee9bc3a9
metrics
time_preparing0.01s
time_computing15.362s
output_files1
analysis
can_execute
True
fenl_diagnostics
(see Diagnostics tab)
schema
(see Schema tab)
request_details
request_id
b70d11defd5ebd4ab5b288247f2f0aeb
expression
# Cross-entity features\n",
+       "\n",
+       "# Compute some per-trip metrics for Dropoffs\n",
+       "let trip_speed = Dropoff.distance_miles / Dropoff.trip_time * 60 * 60\n",
+       "let dropoff_with_metrics = Dropoff | extend({trip_speed})\n",
+       "\n",
+       "# Re-key by trip source and destination\n",
+       "let dropoff_by_src_bin = dropoff_with_metrics | with_key(Dropoff.pu_location_id)\n",
+       "let dropoff_by_dst_bin = dropoff_with_metrics | with_key(Dropoff.do_location_id)\n",
+       "\n",
+       "# Compute aggregates related to trips departing from a given bin\n",
+       "let departure_mean_speed_10m = dropoff_by_src_bin.trip_speed | mean(window=sliding(10,minutely()))\n",
+       "let departure_mean_speed_60m = dropoff_by_src_bin.trip_speed | mean(window=sliding(60,minutely()))\n",
+       "let departure_mean_speed_1d = dropoff_by_src_bin.trip_speed | mean(window=sliding(24,hourly()))\n",
+       "let departure_count_10m = dropoff_by_src_bin | count(window=sliding(10, minutely()))\n",
+       "let departure_count_60m = dropoff_by_src_bin | count(window=sliding(60, minutely()))\n",
+       "let departure_count_1d = dropoff_by_src_bin | count(window=sliding(24, hourly()))\n",
+       "\n",
+       "# Compute aggregates related to trips arriving at a given bin\n",
+       "let arrival_mean_speed_10m = dropoff_by_dst_bin.trip_speed | mean(window=sliding(10,minutely()))\n",
+       "let arrival_mean_speed_60m = dropoff_by_dst_bin.trip_speed | mean(window=sliding(60,minutely()))\n",
+       "let arrival_mean_speed_1d = dropoff_by_dst_bin.trip_speed | mean(window=sliding(24,hourly()))\n",
+       "let arrival_count_10m = dropoff_by_dst_bin | count(window=sliding(10, minutely()))\n",
+       "let arrival_count_60m = dropoff_by_dst_bin | count(window=sliding(60, minutely()))\n",
+       "let arrival_count_1d = dropoff_by_dst_bin | count(window=sliding(24, hourly()))\n",
+       "\n",
+       "#in Pickup | remove_fields($input, \"request_datetime\", \"on_scene_datetime\", \"pickup_datetime\") | extend({\n",
+       "in Pickup | extend({\n",
+       "\n",
+       "    # TODO:\n",
+       "    # hour of day\n",
+       "    # day of week\n",
+       "    #distance_miles: Pickup.distance_miles,\n",
+       "\n",
+       "    monthday: day_of_month(Pickup.pickup_datetime as timestamp_ns) | else(-1),\n",
+       "\n",
+       "    # Features related to recent trips departing from the same area\n",
+       "    departure_mean_speed_10m: departure_mean_speed_10m | lookup(Pickup.pu_location_id),\n",
+       "    departure_mean_speed_60m: departure_mean_speed_60m | lookup(Pickup.pu_location_id),\n",
+       "    departure_mean_speed_1d: departure_mean_speed_1d | lookup(Pickup.pu_location_id),\n",
+       "    departure_count_10m: departure_count_10m | lookup(Pickup.pu_location_id),\n",
+       "    departure_count_60m: departure_count_60m | lookup(Pickup.pu_location_id),\n",
+       "    departure_count_1d: departure_count_1d | lookup(Pickup.pu_location_id),\n",
+       "\n",
+       "    # Features related to recent trips arriving in the same area\n",
+       "    arrival_mean_speed_10m: arrival_mean_speed_10m | lookup(Pickup.do_location_id),\n",
+       "    arrival_mean_speed_60m: arrival_mean_speed_60m | lookup(Pickup.do_location_id),\n",
+       "    arrival_mean_speed_1d: arrival_mean_speed_1d | lookup(Pickup.do_location_id),\n",
+       "    arrival_count_10m: arrival_count_10m | lookup(Pickup.do_location_id),\n",
+       "    arrival_count_60m: arrival_count_60m | lookup(Pickup.do_location_id),\n",
+       "    arrival_count_1d: arrival_count_1d | lookup(Pickup.do_location_id),\n",
+       "})\n",
+       "# We'll make predictions from features computed at the time of each pickup\n",
+       "| when(is_valid(Pickup))\n",
+       "\n",
+       "# We'll predict the duration of the trip, which we learn at the time of the next dropoff\n",
+       "| last(window=since(is_valid(Dropoff)))\n",
+       "| when(is_valid($input) and is_valid(Dropoff))\n",
+       "| extend({target: Dropoff.trip_time})\n",
+       "\n",
+       "# cleaning\n",
+       "| when($input.distance_miles < 50) # distance outliers\n",
+       "| when($input.target < 24 * 60 * 60) # trips longer than a day\n",
+       "| when($input.target > 60) # trips shorter than a minute
column_namecolumn_type
0targeti64
1monthdayi64
2departure_mean_speed_10mf64
3departure_mean_speed_60mf64
4departure_mean_speed_1df64
5departure_count_10mu32
6departure_count_60mu32
7departure_count_1du32
8arrival_mean_speed_10mf64
9arrival_mean_speed_60mf64
10arrival_mean_speed_1df64
11arrival_count_10mu32
12arrival_count_60mu32
13arrival_count_1du32
14hvfhs_license_numstring
15dispatching_base_numstring
16originating_base_numstring
17request_datetimetimestamp_us
18on_scene_datetimetimestamp_us
19pickup_datetimetimestamp_us
20pu_location_idi32
21do_location_idi32
22trip_milesf64
23shared_request_flagstring
24shared_match_flagstring
25access_a_ride_flagstring
26wav_request_flagstring
27wav_match_flagstring
28pu_zonestring
29pu_boroughstring
30do_zonestring
31do_boroughstring
32distance_milesf64
0
warning[W2000]: Incompatible time domains\n",
+       "   --> Query:57:3\n",
+       "   |\n",
+       "57 | | last(window=since(is_valid(Dropoff)))\n",
+       "   |   ^^^^        ------------------------ Time Domain: Table 'Dropoff'\n",
+       "   |   |            \n",
+       "   |   Incompatible time domains for operation\n",
+       "   |\n",
+       "   --> internal:1:1\n",
+       "   |\n",
+       " 1 | $input\n",
+       "   | ------ Time Domain: Table 'Pickup'\n",
+       "\n",
+       "
{'dataframe': 'RangeIndex: 1110 entries, 0 to 1109'\n",
+       "              'Columns: 37 entries, _time to distance_miles'\n",
+       "              'dtypes: datetime64[ns](4), float64(8), int32(2), int64(2), '\n",
+       "              'object(13), uint32(6), uint64(2)'\n",
+       "              'memory usage: 286.3+ KB',\n",
+       " 'expression': '# Cross-entity features'\n",
+       "               ''\n",
+       "               '# Compute some per-trip metrics for Dropoffs'\n",
+       "               'let trip_speed = Dropoff.distance_miles / Dropoff.trip_time * '\n",
+       "               '60 * 60'\n",
+       "               'let dropoff_with_metrics = Dropoff | extend({trip_speed})'\n",
+       "               ''\n",
+       "               '# Re-key by trip source and destination'\n",
+       "               'let dropoff_by_src_bin = dropoff_with_metrics | '\n",
+       "               'with_key(Dropoff.pu_location_id)'\n",
+       "               'let dropoff_by_dst_bin = dropoff_with_metrics | '\n",
+       "               'with_key(Dropoff.do_location_id)'\n",
+       "               ''\n",
+       "               '# Compute aggregates related to trips departing from a given '\n",
+       "               'bin'\n",
+       "               'let departure_mean_speed_10m = dropoff_by_src_bin.trip_speed | '\n",
+       "               'mean(window=sliding(10,minutely()))'\n",
+       "               'let departure_mean_speed_60m = dropoff_by_src_bin.trip_speed | '\n",
+       "               'mean(window=sliding(60,minutely()))'\n",
+       "               'let departure_mean_speed_1d = dropoff_by_src_bin.trip_speed | '\n",
+       "               'mean(window=sliding(24,hourly()))'\n",
+       "               'let departure_count_10m = dropoff_by_src_bin | '\n",
+       "               'count(window=sliding(10, minutely()))'\n",
+       "               'let departure_count_60m = dropoff_by_src_bin | '\n",
+       "               'count(window=sliding(60, minutely()))'\n",
+       "               'let departure_count_1d = dropoff_by_src_bin | '\n",
+       "               'count(window=sliding(24, hourly()))'\n",
+       "               ''\n",
+       "               '# Compute aggregates related to trips arriving at a given bin'\n",
+       "               'let arrival_mean_speed_10m = dropoff_by_dst_bin.trip_speed | '\n",
+       "               'mean(window=sliding(10,minutely()))'\n",
+       "               'let arrival_mean_speed_60m = dropoff_by_dst_bin.trip_speed | '\n",
+       "               'mean(window=sliding(60,minutely()))'\n",
+       "               'let arrival_mean_speed_1d = dropoff_by_dst_bin.trip_speed | '\n",
+       "               'mean(window=sliding(24,hourly()))'\n",
+       "               'let arrival_count_10m = dropoff_by_dst_bin | '\n",
+       "               'count(window=sliding(10, minutely()))'\n",
+       "               'let arrival_count_60m = dropoff_by_dst_bin | '\n",
+       "               'count(window=sliding(60, minutely()))'\n",
+       "               'let arrival_count_1d = dropoff_by_dst_bin | '\n",
+       "               'count(window=sliding(24, hourly()))'\n",
+       "               ''\n",
+       "               '#in Pickup | remove_fields($input, \"request_datetime\", '\n",
+       "               '\"on_scene_datetime\", \"pickup_datetime\") | extend({'\n",
+       "               'in Pickup | extend({'\n",
+       "               ''\n",
+       "               '    # TODO:'\n",
+       "               '    # hour of day'\n",
+       "               '    # day of week'\n",
+       "               '    #distance_miles: Pickup.distance_miles,'\n",
+       "               ''\n",
+       "               '    monthday: day_of_month(Pickup.pickup_datetime as '\n",
+       "               'timestamp_ns) | else(-1),'\n",
+       "               ''\n",
+       "               '    # Features related to recent trips departing from the same '\n",
+       "               'area'\n",
+       "               '    departure_mean_speed_10m: departure_mean_speed_10m | '\n",
+       "               'lookup(Pickup.pu_location_id),'\n",
+       "               '    departure_mean_speed_60m: departure_mean_speed_60m | '\n",
+       "               'lookup(Pickup.pu_location_id),'\n",
+       "               '    departure_mean_speed_1d: departure_mean_speed_1d | '\n",
+       "               'lookup(Pickup.pu_location_id),'\n",
+       "               '    departure_count_10m: departure_count_10m | '\n",
+       "               'lookup(Pickup.pu_location_id),'\n",
+       "               '    departure_count_60m: departure_count_60m | '\n",
+       "               'lookup(Pickup.pu_location_id),'\n",
+       "               '    departure_count_1d: departure_count_1d | '\n",
+       "               'lookup(Pickup.pu_location_id),'\n",
+       "               ''\n",
+       "               '    # Features related to recent trips arriving in the same '\n",
+       "               'area'\n",
+       "               '    arrival_mean_speed_10m: arrival_mean_speed_10m | '\n",
+       "               'lookup(Pickup.do_location_id),'\n",
+       "               '    arrival_mean_speed_60m: arrival_mean_speed_60m | '\n",
+       "               'lookup(Pickup.do_location_id),'\n",
+       "               '    arrival_mean_speed_1d: arrival_mean_speed_1d | '\n",
+       "               'lookup(Pickup.do_location_id),'\n",
+       "               '    arrival_count_10m: arrival_count_10m | '\n",
+       "               'lookup(Pickup.do_location_id),'\n",
+       "               '    arrival_count_60m: arrival_count_60m | '\n",
+       "               'lookup(Pickup.do_location_id),'\n",
+       "               '    arrival_count_1d: arrival_count_1d | '\n",
+       "               'lookup(Pickup.do_location_id),'\n",
+       "               '})'\n",
+       "               \"# We'll make predictions from features computed at the time of \"\n",
+       "               'each pickup'\n",
+       "               '| when(is_valid(Pickup))'\n",
+       "               ''\n",
+       "               \"# We'll predict the duration of the trip, which we learn at \"\n",
+       "               'the time of the next dropoff'\n",
+       "               '| last(window=since(is_valid(Dropoff)))'\n",
+       "               '| when(is_valid($input) and is_valid(Dropoff))'\n",
+       "               '| extend({target: Dropoff.trip_time})'\n",
+       "               ''\n",
+       "               '# cleaning'\n",
+       "               '| when($input.distance_miles < 50) # distance outliers'\n",
+       "               '| when($input.target < 24 * 60 * 60) # trips longer than a '\n",
+       "               'day'\n",
+       "               '| when($input.target > 60) # trips shorter than a minute',\n",
+       " 'query_response': state: STATE_SUCCESS\n",
+       "config {\n",
+       "  data_token_id: \"c7a71379-778e-4824-9c66-9cbcc384e439\"\n",
+       "  slice_request {\n",
+       "    percent {\n",
+       "      percent: 2\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "analysis {\n",
+       "  can_execute: true\n",
+       "  schema {\n",
+       "    fields {\n",
+       "      name: \"target\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_I64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"monthday\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_I64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"departure_mean_speed_10m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"departure_mean_speed_60m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"departure_mean_speed_1d\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"departure_count_10m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_U32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"departure_count_60m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_U32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"departure_count_1d\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_U32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"arrival_mean_speed_10m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"arrival_mean_speed_60m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"arrival_mean_speed_1d\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"arrival_count_10m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_U32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"arrival_count_60m\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_U32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"arrival_count_1d\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_U32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"hvfhs_license_num\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"dispatching_base_num\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"originating_base_num\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"request_datetime\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_TIMESTAMP_MICROSECOND\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"on_scene_datetime\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_TIMESTAMP_MICROSECOND\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pickup_datetime\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_TIMESTAMP_MICROSECOND\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pu_location_id\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_I32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"do_location_id\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_I32\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"trip_miles\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"shared_request_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"shared_match_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"access_a_ride_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"wav_request_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"wav_match_flag\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pu_zone\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"pu_borough\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"do_zone\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"do_borough\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_STRING\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "    fields {\n",
+       "      name: \"distance_miles\"\n",
+       "      data_type {\n",
+       "        primitive: PRIMITIVE_TYPE_F64\n",
+       "      }\n",
+       "      nullable: true\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "fenl_diagnostics {\n",
+       "  fenl_diagnostics {\n",
+       "    severity: SEVERITY_WARNING\n",
+       "    code: \"W2000\"\n",
+       "    message: \"Incompatible time domains\"\n",
+       "    formatted: \"warning[W2000]: Incompatible time domains   --> Query:57:3   |57 | | last(window=since(is_valid(Dropoff)))   |   ^^^^        ------------------------ Time Domain: Table \\'Dropoff\\'   |   |               |   Incompatible time domains for operation   |   --> internal:1:1   | 1 | $input   | ------ Time Domain: Table \\'Pickup\\'\"\n",
+       "  }\n",
+       "}\n",
+       "metrics {\n",
+       "  time_preparing {\n",
+       "    nanos: 10151584\n",
+       "  }\n",
+       "  time_computing {\n",
+       "    seconds: 15\n",
+       "    nanos: 362543916\n",
+       "  }\n",
+       "  output_files: 1\n",
+       "  total_input_rows: 17367583\n",
+       "  processed_input_rows: 17367583\n",
+       "  produced_output_rows: 1110\n",
+       "}\n",
+       "request_details {\n",
+       "  request_id: \"b70d11defd5ebd4ab5b288247f2f0aeb\"\n",
+       "}\n",
+       "query_id: \"754fd0a4-7d1a-421a-82ac-0e1eee9bc3a9\"\n",
+       "destination {\n",
+       "  object_store {\n",
+       "    file_type: FILE_TYPE_PARQUET\n",
+       "    output_prefix_uri: \"file:///Users/ryan.michael/.cache/kaskada/data/results/9817b83f-211b-41b8-8d7f-5026fcc14d48/nhmM-0CVVBzzb5R3_98_-Ea2g8y0VPsws9bAUQ/\"\n",
+       "    output_paths {\n",
+       "      paths: \"/Users/ryan.michael/.cache/kaskada/data/results/9817b83f-211b-41b8-8d7f-5026fcc14d48/nhmM-0CVVBzzb5R3_98_-Ea2g8y0VPsws9bAUQ/7b9d632d-7a71-4039-9cd9-6ba04ad82179-part-0.parquet\"\n",
+       "    }\n",
+       "  }\n",
+       "}\n",
+       "}
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%fenl --var examples\n", + "\n", + "# Cross-entity features\n", + "\n", + "# Compute some per-trip metrics for Dropoffs\n", + "let trip_speed = Dropoff.distance_miles / Dropoff.trip_time * 60 * 60\n", + "let dropoff_with_metrics = Dropoff | extend({trip_speed})\n", + "\n", + "# Re-key by trip source and destination\n", + "let dropoff_by_src_bin = dropoff_with_metrics | with_key(Dropoff.pu_location_id)\n", + "let dropoff_by_dst_bin = dropoff_with_metrics | with_key(Dropoff.do_location_id)\n", + "\n", + "# Compute aggregates related to trips departing from a given bin\n", + "let departure_mean_speed_10m = dropoff_by_src_bin.trip_speed | mean(window=sliding(10,minutely()))\n", + "let departure_mean_speed_60m = dropoff_by_src_bin.trip_speed | mean(window=sliding(60,minutely()))\n", + "let departure_mean_speed_1d = dropoff_by_src_bin.trip_speed | mean(window=sliding(24,hourly()))\n", + "let departure_count_10m = dropoff_by_src_bin | count(window=sliding(10, minutely()))\n", + "let departure_count_60m = dropoff_by_src_bin | count(window=sliding(60, minutely()))\n", + "let departure_count_1d = dropoff_by_src_bin | count(window=sliding(24, hourly()))\n", + "\n", + "# Compute aggregates related to trips arriving at a given bin\n", + "let arrival_mean_speed_10m = dropoff_by_dst_bin.trip_speed | mean(window=sliding(10,minutely()))\n", + "let arrival_mean_speed_60m = dropoff_by_dst_bin.trip_speed | mean(window=sliding(60,minutely()))\n", + "let arrival_mean_speed_1d = dropoff_by_dst_bin.trip_speed | mean(window=sliding(24,hourly()))\n", + "let arrival_count_10m = dropoff_by_dst_bin | count(window=sliding(10, minutely()))\n", + "let arrival_count_60m = dropoff_by_dst_bin | count(window=sliding(60, minutely()))\n", + "let arrival_count_1d = dropoff_by_dst_bin | count(window=sliding(24, hourly()))\n", + "\n", + "in Pickup | 
remove_fields($input, \"request_datetime\", \"on_scene_datetime\", \"pickup_datetime\") | extend({\n", + "\n", + " # TODO:\n", + " # hour of day\n", + " # day of week\n", + " #distance_miles: Pickup.distance_miles,\n", + "\n", + " monthday: day_of_month(Pickup.pickup_datetime as timestamp_ns) | else(-1),\n", + "\n", + " # Features related to recent trips departing from the same area\n", + " departure_mean_speed_10m: departure_mean_speed_10m | lookup(Pickup.pu_location_id),\n", + " departure_mean_speed_60m: departure_mean_speed_60m | lookup(Pickup.pu_location_id),\n", + " departure_mean_speed_1d: departure_mean_speed_1d | lookup(Pickup.pu_location_id),\n", + " departure_count_10m: departure_count_10m | lookup(Pickup.pu_location_id),\n", + " departure_count_60m: departure_count_60m | lookup(Pickup.pu_location_id),\n", + " departure_count_1d: departure_count_1d | lookup(Pickup.pu_location_id),\n", + "\n", + " # Features related to recent trips arriving in the same area\n", + " arrival_mean_speed_10m: arrival_mean_speed_10m | lookup(Pickup.do_location_id),\n", + " arrival_mean_speed_60m: arrival_mean_speed_60m | lookup(Pickup.do_location_id),\n", + " arrival_mean_speed_1d: arrival_mean_speed_1d | lookup(Pickup.do_location_id),\n", + " arrival_count_10m: arrival_count_10m | lookup(Pickup.do_location_id),\n", + " arrival_count_60m: arrival_count_60m | lookup(Pickup.do_location_id),\n", + " arrival_count_1d: arrival_count_1d | lookup(Pickup.do_location_id),\n", + "})\n", + "# We'll make predictions from features computed at the time of each pickup\n", + "| when(is_valid(Pickup))\n", + "\n", + "# We'll predict the duration of the trip, which we learn at the time of the next dropoff\n", + "| last(window=since(is_valid(Dropoff)))\n", + "| when(is_valid($input) and is_valid(Dropoff))\n", + "| extend({target: Dropoff.trip_time})\n", + "\n", + "# cleaning\n", + "| when($input.distance_miles < 50) # distance outliers\n", + "| when($input.target < 24 * 60 * 60) # trips longer 
than a day\n", + "| when($input.target > 60) # trips shorter than a minute" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "1836d268", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABSoAAAMtCAYAAACYeU88AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA/j0lEQVR4nO3de5DV9X3/8deCsIBxQTSw7IhINPESLyQYcRvjaEQWwjgh0kw0NkVLdHTAKWyjkYxBlHRsSb1WDJMmip1KqnYa04iDbLBCratWlBpMdIw1QzK6mKq4EeOysvv7I+X8XFBgXeDD4uMxs6PnfD97zvvs7Mezeeac863q7OzsDAAAAABAQX1KDwAAAAAAIFQCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHH7lR5gb9bR0ZGXXnopBxxwQKqqqkqPAwAAAAC9SmdnZ37/+9+nrq4uffps/zWTQuV2vPTSSxk5cmTpMQAAAACgV/vNb36TQw45ZLtrhMrtOOCAA5L88QdZU1NTeJrdo729PcuXL8+ECRPSr1+/0uNAr2QfQc/ZR9Bz9hH0nH0EPWcfsbXW1taMHDmy0tm2R6jcji1v966pqdmnQ+WgQYNSU1PjPyDwAdlH0HP2EfScfQQ9Zx9Bz9lHvJ+d+VhFJ9MBAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKC4boXKa6+9Np/5zGdywAEHZNiwYZkyZUqee+65LmtOO+20VFVVdfm6+OKLu6xZt25dJk+enEGDBmXYsGG57LLL8s4773RZ89BDD+XTn/50qqurc8QRR2Tx4sXbzLNw4cIcdthhGTBgQMaNG5fHH3+8y/G33347M2bMyEEHHZSPfOQjmTp1atavX9+dhwwAAAAA7AHdCpUrV67MjBkz8uijj6apqSnt7e2ZMGFCNm7c2GXdhRdemJdffrnytWDBgsqxzZs3Z/Lkydm0aVMeeeSR3HHHHVm8eHHmzp1bWfPiiy9m8uTJOf3007NmzZrMmjUrX//61/PAAw9U1tx1111pbGzMVVddlSeffDInnHBCGhoa8sorr1TWzJ49Oz/96U9zzz33ZOXKlXnppZdy9tlnd/uHBAAAAADsXvt1Z/GyZcu6XF68eHGGDRuW1atX59RTT61cP2jQoNTW1r7nbSxfvjy/+MUv8rOf/SzDhw/PmDFjMn/+/Hzzm9/MvHnz0r9//yxatCijR4/OddddlyQ5+uij8/DDD+eGG25IQ0NDkuT666/PhRdemAsuuCBJsmjRoixdujS33XZbrrjiirzxxhv54Q9/mCVLluTzn/98kuT222/P0UcfnUcffTQnn3xydx46AAAAALAbdStUbu2NN95IkgwdOrTL9XfeeWf+6Z/+KbW1
tTnrrLPy7W9/O4MGDUqSNDc357jjjsvw4cMr6xsaGnLJJZfkmWeeyac+9ak0Nzdn/PjxXW6zoaEhs2bNSpJs2rQpq1evzpw5cyrH+/Tpk/Hjx6e5uTlJsnr16rS3t3e5naOOOiqHHnpompub3zNUtrW1pa2trXK5tbU1SdLe3p729vZu/3x6gy2Pa199fLAn2EfQc/YR9Jx9BD1nH0HP2UdsrTu/Cx84VHZ0dGTWrFn57Gc/m2OPPbZy/Ve/+tWMGjUqdXV1efrpp/PNb34zzz33XP71X/81SdLS0tIlUiapXG5padnumtbW1vzhD3/I66+/ns2bN7/nmmeffbZyG/3798+QIUO2WbPlfrZ27bXX5uqrr97m+uXLl1dC676qqamp9AjQ69lH0HP2EfScfQQ9Zx9Bz9lHbPHWW2/t9NoPHCpnzJiRtWvX5uGHH+5y/UUXXVT59+OOOy4jRozIGWeckRdeeCGHH374B727PWLOnDlpbGysXG5tbc3IkSMzYcKE1NTUFJxs92lvb09TU1POPPPM9OvXr/Q40CvZR9Bz9hH0nH0EPWcfQc/ZR2xtyzuWd8YHCpUzZ87Mfffdl1WrVuWQQw7Z7tpx48YlSX71q1/l8MMPT21t7TZn595yJu4tn2tZW1u7zdm5169fn5qamgwcODB9+/ZN375933PNu29j06ZN2bBhQ5dXVb57zdaqq6tTXV29zfX9+vXb5zfXh+Exwu5mH0HP2UfQc/YR9Jx9BD1nH7FFd34PuhUqOzs7c+mll+bHP/5xHnrooYwePXqH37NmzZokyYgRI5Ik9fX1+eu//uu88sorGTZsWJI/vhy4pqYmxxxzTGXN/fff3+V2mpqaUl9fnyTp379/xo4dmxUrVmTKlClJ/vhW9BUrVmTmzJlJkrFjx6Zfv35ZsWJFpk6dmiR57rnnsm7dusrtAMCucNgVS0uP0OtV9+3MgpOSY+c9kLbNVXv8/n/9N5P3+H0CAABddStUzpgxI0uWLMlPfvKTHHDAAZXPehw8eHAGDhyYF154IUuWLMkXvvCFHHTQQXn66acze/bsnHrqqTn++OOTJBMmTMgxxxyTr33ta1mwYEFaWlpy5ZVXZsaMGZVXM1588cW55ZZbcvnll+cv/uIv8uCDD+buu+/O0qX//38INjY2Ztq0aTnxxBNz0kkn5cYbb8zGjRsrZwEfPHhwpk+fnsbGxgwdOjQ1NTW59NJLU19f74zfAAAAALCX6Vao/N73vpckOe2007pcf/vtt+f8889P//7987Of/awSDUeOHJmpU6fmyiuvrKzt27dv7rvvvlxyySWpr6/P/vvvn2nTpuWaa66prBk9enSWLl2a2bNn56abbsohhxySH/zgB2loaKis+cpXvpLf/e53mTt3blpaWjJmzJgsW7asywl2brjhhvTp0ydTp05NW1tbGhoacuutt3brBwQAAAAA7H7dfuv39owcOTIrV67c4e2MGjVqm7d2b+20007LU089td01M2fOrLzV+70MGDAgCxcuzMKFC3c4EwAAAABQTp/SAwAAAAAACJUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAU
J1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABTXrVB57bXX5jOf+UwOOOCADBs2LFOmTMlzzz3XZc3bb7+dGTNm5KCDDspHPvKRTJ06NevXr++yZt26dZk8eXIGDRqUYcOG5bLLLss777zTZc1DDz2UT3/606murs4RRxyRxYsXbzPPwoULc9hhh2XAgAEZN25cHn/88W7PAgAAAACU161QuXLlysyYMSOPPvpompqa0t7engkTJmTjxo2VNbNnz85Pf/rT3HPPPVm5cmVeeumlnH322ZXjmzdvzuTJk7Np06Y88sgjueOOO7J48eLMnTu3subFF1/M5MmTc/rpp2fNmjWZNWtWvv71r+eBBx6orLnrrrvS2NiYq666Kk8++WROOOGENDQ05JVXXtnpWQAAAACAvcN+3Vm8bNmyLpcXL16cYcOGZfXq1Tn11FPzxhtv5Ic//GGWLFmSz3/+80mS22+/PUcffXQeffTRnHzyyVm+fHl+8Ytf5Gc/+1mGDx+eMWPGZP78+fnmN7+ZefPmpX///lm0aFFGjx6d6667Lkly9NFH5+GHH84NN9yQhoaGJMn111+fCy+8MBdccEGSZNGiRVm6dGluu+22XHHFFTs1CwAAAACwd+hWqNzaG2+8kSQZOnRokmT16tVpb2/P+PHjK2uOOuqoHHrooWlubs7JJ5+c5ubmHHfccRk+fHhlTUNDQy655JI888wz+dSnPpXm5uYut7FlzaxZs5IkmzZtyurVqzNnzpzK8T59+mT8+PFpbm7e6Vm21tbWlra2tsrl1tbWJEl7e3va29s/0M9ob7flce2rjw/2BPuI6r6dpUfo9ar7dHb5555m/7Iv8HwEPWcfQc/ZR2ytO78LHzhUdnR0ZNasWfnsZz+bY489NknS0tKS/v37Z8iQIV3WDh8+PC0tLZU1746UW45vOba9Na2trfnDH/6Q119/PZs3b37PNc8+++xOz7K1a6+9NldfffU21y9fvjyDBg16vx/FPqGpqan0CNDr2UcfXgtOKj3BvmP+iR1F7vf+++8vcr+wO3g+gp6zj6Dn7CO2eOutt3Z67QcOlTNmzMja
tWvz8MMPf9Cb2OvMmTMnjY2Nlcutra0ZOXJkJkyYkJqamoKT7T7t7e1pamrKmWeemX79+pUeB3ol+4hj5z2w40VsV3Wfzsw/sSPffqJP2jqq9vj9r53XsMfvE3Y1z0fQc/YR9Jx9xNa2vGN5Z3ygUDlz5szcd999WbVqVQ455JDK9bW1tdm0aVM2bNjQ5ZWM69evT21tbWXN1mfn3nIm7nev2frs3OvXr09NTU0GDhyYvn37pm/fvu+55t23saNZtlZdXZ3q6uptru/Xr98+v7k+DI8Rdjf76MOrbfOeD2v7qraOqiI/T3uXfYnnI+g5+wh6zj5ii+78HnTrrN+dnZ2ZOXNmfvzjH+fBBx/M6NGjuxwfO3Zs+vXrlxUrVlSue+6557Ju3brU19cnSerr6/Pzn/+8y9m5m5qaUlNTk2OOOaay5t23sWXNltvo379/xo4d22VNR0dHVqxYUVmzM7MAAAAAAHuHbr2icsaMGVmyZEl+8pOf5IADDqh81uPgwYMzcODADB48ONOnT09jY2OGDh2ampqaXHrppamvr6+cvGbChAk55phj8rWvfS0LFixIS0tLrrzyysyYMaPyasaLL744t9xySy6//PL8xV/8RR588MHcfffdWbp0aWWWxsbGTJs2LSeeeGJOOumk3Hjjjdm4cWPlLOA7MwsAAAAAsHfoVqj83ve+lyQ57bTTulx/++235/zzz0+S3HDDDenTp0+mTp2atra2NDQ05NZbb62s7du3b+67775ccsklqa+vz/77759p06blmmuuqawZPXp0li5dmtmzZ+emm27KIYcckh/84AdpaPj/nx/1la98Jb/73e8yd+7ctLS0ZMyYMVm2bFmXE+zsaBYAAAAAYO/QrVDZ2dm5wzUDBgzIwoULs3DhwvddM2rUqB2eXfO0007LU089td01M2fOzMyZM3s0CwAAAABQXrc+oxIAAAAAYHcQKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACK
EyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4vYrPQDAh91hVywtPQIAAAAU5xWVAAAAAEBxQiUAAAAAUFy3Q+WqVaty1llnpa6uLlVVVbn33nu7HD///PNTVVXV5WvixIld1rz22ms577zzUlNTkyFDhmT69Ol58803u6x5+umn87nPfS4DBgzIyJEjs2DBgm1mueeee3LUUUdlwIABOe6443L//fd3Od7Z2Zm5c+dmxIgRGThwYMaPH5/nn3++uw8ZAAAAANjNuh0qN27cmBNOOCELFy583zUTJ07Myy+/XPn60Y9+1OX4eeedl2eeeSZNTU257777smrVqlx00UWV462trZkwYUJGjRqV1atX57vf/W7mzZuX73//+5U1jzzySM4999xMnz49Tz31VKZMmZIpU6Zk7dq1lTULFizIzTffnEWLFuWxxx7L/vvvn4aGhrz99tvdfdgAAAAAwG7U7ZPpTJo0KZMmTdrumurq6tTW1r7nsV/+8pdZtmxZ/uu//isnnnhikuTv//7v84UvfCF/93d/l7q6utx5553ZtGlTbrvttvTv3z+f/OQns2bNmlx//fWVoHnTTTdl4sSJueyyy5Ik8+fPT1NTU2655ZYsWrQonZ2dufHGG3PllVfmi1/8YpLkH//xHzN8+PDce++9Oeecc7aZra2tLW1tbZXLra2tSZL29va0t7d38yfVO2x5XPvq44M9oaf7qLpv564cB3ql6j6dXf65p3keZF/g7zroOfsIes4+Ymvd+V3YLWf9fuihhzJs2LAceOCB+fznP5/vfOc7Oeigg5Ikzc3NGTJkSCVSJsn48ePTp0+fPPbYY/nSl76U5ubmnHrqqenfv39lTUNDQ/72b/82r7/+eg488MA0NzensbGxy/02NDRU3or+4osvpqWlJePHj68cHzx4cMaNG5fm5ub3DJXXXnttrr766m2uX758eQYNGtSjn8nerqmpqfQI0Ot90H204KRdPAj0YvNP7Chyv1t/fAz0Zv6ug56zj6Dn7CO2eOutt3Z67S4PlRMnTszZZ5+d0aNH54UXXsi3vvWtTJo0Kc3Nzenbt29aWloybNiwrkPst1+GDh2alpaWJElLS0tGjx7dZc3w4cMrxw488MC0tLRUrnv3mnffxru/773WbG3OnDld4mdra2tGjhyZCRMmpKamprs/il6hvb09TU1NOfPMM9OvX7/S40Cv1NN9dOy8B3bDVNC7VPfpzPwTO/LtJ/qkraNqj9//2nkNe/w+YVfzdx30nH0EPWcfsbUt71jeGbs8VL77lYrHHXdcjj/++Bx++OF56KGHcsYZZ+zqu9ulqqurU11dvc31/fr12+c314fhMcLu9kH3UdvmPR9lYG/V1lFVZE94DmRf4u866Dn7CHrOPmKL7vwedPtkOt31sY99LAcffHB+9atfJUlqa2vzyiuvdFnzzjvv5LXXXqt8rmVtbW3Wr1/fZc2Wyzta8+7j7/6+91oDAAAAAOwddnuo/O1vf5tXX301I0aMSJLU19dnw4YNWb16dWXNgw8+mI6OjowbN66yZtWqVV0+bLOpqSlHHnlkDjzwwMqaFStWdLmvpqam1NfXJ0lGjx6d2traLmtaW1vz2GOPVdYAAAAAAHuHbofKN998M2vWrMmaNWuS/PGkNWvWrMm6devy5ptv5rLLLsujjz6aX//611mx
YkW++MUv5ogjjkhDwx8/++noo4/OxIkTc+GFF+bxxx/Pf/7nf2bmzJk555xzUldXlyT56le/mv79+2f69Ol55plnctddd+Wmm27q8vmRf/mXf5lly5bluuuuy7PPPpt58+bliSeeyMyZM5MkVVVVmTVrVr7zne/k3/7t3/Lzn/88f/7nf566urpMmTKlhz82AAAAAGBX6vZnVD7xxBM5/fTTK5e3xMNp06ble9/7Xp5++unccccd2bBhQ+rq6jJhwoTMnz+/y2c/3nnnnZk5c2bOOOOM9OnTJ1OnTs3NN99cOT548OAsX748M2bMyNixY3PwwQdn7ty5ueiiiypr/uRP/iRLlizJlVdemW9961v5+Mc/nnvvvTfHHntsZc3ll1+ejRs35qKLLsqGDRtyyimnZNmyZRkwYEB3HzYAAAAAsBt1O1Sedtpp6ezsfN/jDzyw47PXDh06NEuWLNnumuOPPz7/8R//sd01X/7yl/PlL3/5fY9XVVXlmmuuyTXXXLPDmQAAAACAcnb7Z1QCAAAAAOyIUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHHdDpWrVq3KWWedlbq6ulRVVeXee+/tcryzszNz587NiBEjMnDgwIwfPz7PP/98lzWv
vfZazjvvvNTU1GTIkCGZPn163nzzzS5rnn766Xzuc5/LgAEDMnLkyCxYsGCbWe65554cddRRGTBgQI477rjcf//93Z4FAAAAACiv26Fy48aNOeGEE7Jw4cL3PL5gwYLcfPPNWbRoUR577LHsv//+aWhoyNtvv11Zc9555+WZZ55JU1NT7rvvvqxatSoXXXRR5Xhra2smTJiQUaNGZfXq1fnud7+befPm5fvf/35lzSOPPJJzzz0306dPz1NPPZUpU6ZkypQpWbt2bbdmAQAAAADK26+73zBp0qRMmjTpPY91dnbmxhtvzJVXXpkvfvGLSZJ//Md/zPDhw3PvvffmnHPOyS9/+cssW7Ys//Vf/5UTTzwxSfL3f//3+cIXvpC/+7u/S11dXe68885s2rQpt912W/r3759PfvKTWbNmTa6//vpK0LzpppsyceLEXHbZZUmS+fPnp6mpKbfccksWLVq0U7MAAAAAAHuHbofK7XnxxRfT0tKS8ePHV64bPHhwxo0bl+bm5pxzzjlpbm7OkCFDKpEyScaPH58+ffrksccey5e+9KU0Nzfn1FNPTf/+/StrGhoa8rd/+7d5/fXXc+CBB6a5uTmNjY1d7r+hoaHyVvSdmWVrbW1taWtrq1xubW1NkrS3t6e9vb1nP5y91JbHta8+PtgTerqPqvt27spxoFeq7tPZ5Z97mudB9gX+roOes4+g5+wjttad34VdGipbWlqSJMOHD+9y/fDhwyvHWlpaMmzYsK5D7Ldfhg4d2mXN6NGjt7mNLccOPPDAtLS07PB+djTL1q699tpcffXV21y/fPnyDBo06H0e9b6hqamp9AjQ633QfbTgpF08CPRi80/sKHK/W3/ONfRm/q6DnrOPoOfsI7Z46623dnrtLg2Vvd2cOXO6vEqztbU1I0eOzIQJE1JTU1Nwst2nvb09TU1NOfPMM9OvX7/S40Cv1NN9dOy8B3bDVNC7VPfpzPwTO/LtJ/qkraNqj9//2nkNe/w+YVfzdx30nH0EPWcfsbUt71jeGbs0VNbW1iZJ1q9fnxEjRlSuX79+fcaMGVNZ88orr3T5vnfeeSevvfZa5ftra2uzfv36Lmu2XN7Rmncf39EsW6uurk51dfU21/fr12+f31wfhscIu9sH3Udtm/d8lIG9VVtHVZE94TmQfYm/66Dn7CPoOfuILbrze9Dts35vz+jRo1NbW5sVK1ZUrmttbc1jjz2W+vr6JEl9fX02bNiQ1atXV9Y8+OCD6ejoyLhx4yprVq1a1eU97E1NTTnyyCNz4IEHVta8+362rNlyPzszCwAAAACwd+h2qHzzzTezZs2arFmzJskfT1qzZs2arFu3LlVVVZk1a1a+853v5N/+7d/y85//PH/+53+eurq6TJkyJUly9NFHZ+LEibnwwgvz+OOP5z//8z8zc+bMnHPOOamrq0uSfPWrX03//v0zffr0PPPMM7nrrrty0003dXlb9l/+5V9m2bJlue666/Lss89m3rx5eeKJJzJz5swk2alZAAAAAIC9Q7ff+v3EE0/k9NNPr1zeEg+nTZuWxYsX5/LLL8/GjRtz0UUXZcOGDTnllFOybNmyDBgwoPI9d955Z2bOnJkzzjgjffr0ydSpU3PzzTdXjg8ePDjLly/PjBkzMnbs2Bx88MGZO3duLrroosqaP/mTP8mSJUty5ZVX5lvf+lY+/vGP5957782xxx5bWbMzswAAAAAA5XU7VJ522mnp7Ox83+NVVVW55pprcs0117zvmqFDh2bJkiXbvZ/jjz8+//Ef/7HdNV/+8pfz5S9/uUezAAAAAADl7dLPqAQAAAAA+CCESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoA
AAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAitvloXLevHmpqqrq8nXUUUdVjr/99tuZMWNGDjrooHzkIx/J1KlTs379+i63sW7dukyePDmDBg3KsGHDctlll+Wdd97psuahhx7Kpz/96VRXV+eII47I4sWLt5ll4cKFOeywwzJgwICMGzcujz/++K5+uAAAAADALrBbXlH5yU9+Mi+//HLl6+GHH64cmz17dn7605/mnnvuycqVK/PSSy/l7LPPrhzfvHlzJk+enE2bNuWRRx7JHXfckcWLF2fu3LmVNS+++GImT56c008/PWvWrMmsWbPy9a9/PQ888EBlzV133ZXGxsZcddVVefLJJ3PCCSekoaEhr7zyyu54yAAAAABAD+yWULnffvultra28nXwwQcnSd5444388Ic/zPXXX5/Pf/7zGTt2bG6//fY88sgjefTRR5Mky5cvzy9+8Yv80z/9U8aMGZNJkyZl/vz5WbhwYTZt2pQkWbRoUUaPHp3rrrsuRx99dGbOnJk//dM/zQ033FCZ4frrr8+FF16YCy64IMccc0wWLVqUQYMG5bbbbtsdDxkAAAAA6IH9dseNPv/886mrq8uAAQNSX1+fa6+9NoceemhWr16d9vb2jB8/vrL2qKOOyqGHHprm5uacfPLJaW5uznHHHZfhw4dX1jQ0NOSSSy7JM888k0996lNpbm7uchtb1syaNStJsmnTpqxevTpz5sypHO/Tp0/Gjx+f5ubm9527ra0tbW1tlcutra1Jkvb29rS3t/foZ7K32vK49tXHB3tCT/dRdd/OXTkO
9ErVfTq7/HNP8zzIvsDfddBz9hH0nH3E1rrzu7DLQ+W4ceOyePHiHHnkkXn55Zdz9dVX53Of+1zWrl2blpaW9O/fP0OGDOnyPcOHD09LS0uSpKWlpUuk3HJ8y7HtrWltbc0f/vCHvP7669m8efN7rnn22Wffd/Zrr702V1999TbXL1++PIMGDdq5H0Av1dTUVHoE6PU+6D5acNIuHgR6sfkndhS53/vvv7/I/cLu4O866Dn7CHrOPmKLt956a6fX7vJQOWnSpMq/H3/88Rk3blxGjRqVu+++OwMHDtzVd7dLzZkzJ42NjZXLra2tGTlyZCZMmJCampqCk+0+7e3taWpqyplnnpl+/fqVHgd6pZ7uo2PnPbDjRbCPq+7TmfknduTbT/RJW0fVHr//tfMa9vh9wq7m7zroOfsIes4+Ymtb3rG8M3bLW7/fbciQIfnEJz6RX/3qVznzzDOzadOmbNiwocurKtevX5/a2tokSW1t7TZn595yVvB3r9n6TOHr169PTU1NBg4cmL59+6Zv377vuWbLbbyX6urqVFdXb3N9v3799vnN9WF4jLC7fdB91LZ5z0cZ2Fu1dVQV2ROeA9mX+LsOes4+gp6zj9iiO78Hu+VkOu/25ptv5oUXXsiIESMyduzY9OvXLytWrKgcf+6557Ju3brU19cnSerr6/Pzn/+8y9m5m5qaUlNTk2OOOaay5t23sWXNltvo379/xo4d22VNR0dHVqxYUVkDAAAAAOw9dnmo/MY3vpGVK1fm17/+dR555JF86UtfSt++fXPuuedm8ODBmT59ehobG/Pv//7vWb16dS644ILU19fn5JNPTpJMmDAhxxxzTL72ta/lv//7v/PAAw/kyiuvzIwZMyqvdrz44ovzP//zP7n88svz7LPP5tZbb83dd9+d2bNnV+ZobGzMP/zDP+SOO+7IL3/5y1xyySXZuHFjLrjggl39kAEAAACAHtrlb/3+7W9/m3PPPTevvvpqPvrRj+aUU07Jo48+mo9+9KNJkhtuuCF9+vTJ1KlT09bWloaGhtx6662V7+/bt2/uu+++XHLJJamvr8/++++fadOm5ZprrqmsGT16dJYuXZrZs2fnpptuyiGHHJIf/OAHaWj4/58v9ZWvfCW/+93vMnfu3LS0tGTMmDFZtmzZNifYAQAAAADK2+Wh8p//+Z+3e3zAgAFZuHBhFi5c+L5rRo0atcOzb5522ml56qmntrtm5syZmTlz5nbXAAAAAADl7fbPqAQAAAAA2BGhEgAAAAAoTqgEAAAAAIrb5Z9RCQDQ2xx2xdLSI9ADv/6byaVHAABgF/CKSgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAoTqgEAAAAAIrbr/QAAADQE4ddsbT0CHuF6r6dWXBScuy8B9K2uar0ON3y67+ZXHoEAGAv4BWVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABA
cUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUJ1QCAAAAAMUJlQAAAABAcUIlAAAAAFCcUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCQAAAAAUt1/pAQAAgA+3w65YWnoEeujXfzO59AgA7AO8ohIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDi9is9AAAAAFDWYVcsLT0CPfDrv5lcegTYJbyiEgAAAAAoTqgEAAAAAIoTKgEAAACA4oRKAAAAAKA4J9MBAAAA6MX2ppMhVfftzIKTkmPnPZC2zVWlx+k1nBDpjz4Ur6hcuHBhDjvssAwYMCDjxo3L448/XnokAAAAAOBd9vlQedddd6WxsTFXXXVVnnzyyZxwwglpaGjIK6+8Uno0AAAAAOD/7PNv/b7++utz4YUX5oILLkiSLFq0KEuXLs1tt92WK664osvatra2tLW1VS6/8cYbSZLXXnst7e3te27oPai9vT1vvfVWXn311fTr16/0ONAr9XQf7ffOxt0wFfQu+3V05q23OrJfe59s7vAWIfgg7CNKevXVV0uPsEt8mP/3kb9J2VU8H30w+8p/R9/L73//+yRJZ2fnDtdWde7Mql5q06ZNGTRoUP7lX/4lU6ZMqVw/bdq0bNiwIT/5yU+6rJ83b16uvvrqPTwlAAAAAOzbfvOb3+SQQw7Z7pp9+hWV//u//5vNmzdn+PDhXa4fPnx4nn322W3Wz5kzJ42NjZXLHR0dee2113LQQQelqmrf/H8BWltbM3LkyPzmN79JTU1N6XGgV7KPoOfsI+g5+wh6zj6CnrOP2FpnZ2d+//vfp66ubodr9+lQ2V3V1dWprq7uct2QIUPKDLOH1dTU+A8I9JB9BD1nH0HP2UfQc/YR9Jx9xLsNHjx4p9bt0yfTOfjgg9O3b9+sX7++y/Xr169PbW1toakAAAAAgK3t06Gyf//+GTt2bFasWFG5rqOjIytWrEh9fX3ByQAAAACAd9vn3/rd2NiYadOm5cQTT8xJJ52UG2+8MRs3bqycBfzDrrq6OlddddU2b3kHdp59BD1nH0HP2UfQc/YR9Jx9RE/s02f93uKWW27Jd7/73bS0tGTMmDG5+eabM27cuNJjAQAAAAD/50MRKgEAAACAvds+/RmVAAAAAEDvIFQCAAAAAMUJlQAAAABAcUIlAAAAAFCcUPkhtnDhwhx22GEZMGBAxo0bl8cff7z0SNBrzJs3L1VVVV2+jjrqqNJjwV5v1apVOeuss1JXV5eqqqrce++9XY53dnZm7ty5GTFiRAYOHJjx48fn+eefLzMs7KV2tI/OP//8bZ6jJk6cWGZY2Atde+21+cxnPpMDDjggw4YNy5QpU/Lcc891WfP2229nxowZOeigg/KRj3wkU6dOzfr16wtNDHufndlHp5122jbPRxdffHGhiekthMoPqbvuuiuNjY256qqr8uSTT+aEE05IQ0NDXnnlldKj
Qa/xyU9+Mi+//HLl6+GHHy49Euz1Nm7cmBNOOCELFy58z+MLFizIzTffnEWLFuWxxx7L/vvvn4aGhrz99tt7eFLYe+1oHyXJxIkTuzxH/ehHP9qDE8LebeXKlZkxY0YeffTRNDU1pb29PRMmTMjGjRsra2bPnp2f/vSnueeee7Jy5cq89NJLOfvsswtODXuXndlHSXLhhRd2eT5asGBBoYnpLao6Ozs7Sw/Bnjdu3Lh85jOfyS233JIk6ejoyMiRI3PppZfmiiuuKDwd7P3mzZuXe++9N2vWrCk9CvRaVVVV+fGPf5wpU6Yk+eOrKevq6vJXf/VX+cY3vpEkeeONNzJ8+PAsXrw455xzTsFpYe+09T5K/viKyg0bNmzzSkvgvf3ud7/LsGHDsnLlypx66ql544038tGPfjRLlizJn/7pnyZJnn322Rx99NFpbm7OySefXHhi2PtsvY+SP76icsyYMbnxxhvLDkev4hWVH0KbNm3K6tWrM378+Mp1ffr0yfjx49Pc3FxwMuhdnn/++dTV1eVjH/tYzjvvvKxbt670SNCrvfjii2lpaeny/DR48OCMGzfO8xN000MPPZRhw4blyCOPzCWXXJJXX3219Eiw13rjjTeSJEOHDk2SrF69Ou3t7V2ej4466qgceuihno/gfWy9j7a48847c/DBB+fYY4/NnDlz8tZbb5UYj15kv9IDsOf97//+bzZv3pzhw4d3uX748OF59tlnC00Fvcu4ceOyePHiHHnkkXn55Zdz9dVX53Of+1zWrl2bAw44oPR40Cu1tLQkyXs+P205BuzYxIkTc/bZZ2f06NF54YUX8q1vfSuTJk1Kc3Nz+vbtW3o82Kt0dHRk1qxZ+exnP5tjjz02yR+fj/r3758hQ4Z0Wev5CN7be+2jJPnqV7+aUaNGpa6uLk8//XS++c1v5rnnnsu//uu/FpyWvZ1QCfABTJo0qfLvxx9/fMaNG5dRo0bl7rvvzvTp0wtOBsCH3bs/JuG4447L8ccfn8MPPzwPPfRQzjjjjIKTwd5nxowZWbt2rc8ahx54v3100UUXVf79uOOOy4gRI3LGGWfkhRdeyOGHH76nx6SX8NbvD6GDDz44ffv23easdevXr09tbW2hqaB3GzJkSD7xiU/kV7/6VelRoNfa8hzk+Ql2rY997GM5+OCDPUfBVmbOnJn77rsv//7v/55DDjmkcn1tbW02bdqUDRs2dFnv+Qi29X776L2MGzcuSTwfsV1C5YdQ//79M3bs2KxYsaJyXUdHR1asWJH6+vqCk0Hv9eabb+aFF17IiBEjSo8Cvdbo0aNTW1vb5fmptbU1jz32mOcn6IHf/va3efXVVz1Hwf/p7OzMzJkz8+Mf/zgPPvhgRo8e3eX42LFj069fvy7PR88991zWrVvn+Qj+z4720XvZciJSz0dsj7d+f0g1NjZm2rRpOfHEE3PSSSflxhtvzMaNG3PBBReUHg16hW984xs566yzMmrUqLz00ku56qqr0rdv35x77rmlR4O92ptvvtnl/0V/8cUXs2bNmgwdOjSHHnpoZs2ale985zv5+Mc/ntGjR+fb3/526urqupzRGD7strePhg4dmquvvjpTp05NbW1tXnjhhVx++eU54ogj0tDQUHBq2HvMmDEjS5YsyU9+8pMccMABlc+dHDx4cAYOHJjBgwdn+vTpaWxszNChQ1NTU5NLL7009fX1zvgN/2dH++iFF17IkiVL8oUvfCEHHXRQnn766cyePTunnnpqjj/++MLTszer6uzs7Cw9BGXccsst+e53v5uWlpaMGTMmN998c+Wl2MD2nXPOOVm1alVeffXVfPSjH80pp5ySv/7rv/ZZK7ADDz30UE4//fRtrp82bVoWL16czs7OXHXVVfn+97+fDRs25JRTTsmtt96aT3ziEwWmhb3T9vbR9773vUyZMiVPPfVUNmzYkLq6ukyYMCHz58/f5kRV8GFVVVX1ntfffvvtOf/885Mkb7/9
dv7qr/4qP/rRj9LW1paGhobceuut3voN/2dH++g3v/lN/uzP/ixr167Nxo0bM3LkyHzpS1/KlVdemZqamj08Lb2JUAkAAAAAFOczKgEAAACA4oRKAAAAAKA4oRIAAAAAKE6oBAAAAACKEyoBAAAAgOKESgAAAACgOKESAAAAAChOqAQAAAAAihMqAQAAAIDihEoAAAAAoDihEgAAAAAo7v8Bfrk/p9DJsPAAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "examples.dataframe.departure_mean_speed_1d.hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "8e870eb3", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", + "from datetime import timedelta\n", + "import datetime as dt\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams['figure.figsize'] = [16, 10]\n", + "import seaborn as sns\n", + "import xgboost as xgb\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.cluster import MiniBatchKMeans\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "692e0d5e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ValueError: A given column is not a column of the dataframe\n" + ] + } + ], + "source": [ + "X = examples.dataframe.drop([\"target\", \"_key\", \"_key_hash\", \"_subsort\", \"_time\"], axis=1)\n", + "y = examples.dataframe[\"target\"]\n", + "\n", + "\n", + "columns_to_encode = [\n", + " 'hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', \n", + " 'pu_location_id', 'do_location_id', 'shared_request_flag', 'shared_match_flag', \n", + " 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag', 'pu_zone', 'pu_borough', \n", + " 'do_zone', 'do_borough', 'monthday', \n", + "]\n", + "ct = ColumnTransformer(\n", + " [('encoder', OneHotEncoder(sparse=False), columns_to_encode)],\n", + " remainder='passthrough' \n", + ")\n", + "X = ct.fit_transform(X)\n", + "\n", + "Xtr, Xv, ytr, yv = train_test_split(X, y, train_size=50000, test_size=10000, 
random_state=42)\n", + "dtrain = xgb.DMatrix(Xtr, label=ytr)\n", + "dvalid = xgb.DMatrix(Xv, label=yv)\n", + "watchlist = [(dtrain, 'train'), (dvalid, 'valid')]\n", + "\n", + "# Try different parameters! My favorite is random search :)\n", + "xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.7, 'max_depth': 5,\n", + " 'subsample': 0.7, 'lambda': 1., 'nthread': 4, 'booster' : 'gbtree',\n", + " 'eval_metric': 'rmsle', 'objective': 'reg:linear'}\n", + "\n", + "# 0.28976 leader, 0.37195 10%\n", + "model = xgb.train(xgb_pars, dtrain, 60, watchlist, early_stopping_rounds=50,\n", + " maximize=False, verbose_eval=10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/nyc-taxi/README.md b/examples/nyc-taxi/README.md new file mode 100644 index 000000000..0a3e3ccc7 --- /dev/null +++ b/examples/nyc-taxi/README.md @@ -0,0 +1,126 @@ +# NYC Taxi example + +This notebook and dataset are intended to help you get started writing queries quickly. + +The included notebook sets up Kaskada, creates a table for the data, and loads the data. + +You can use the notebook in Docker by running the following command in this directory, which will download a docker container with Jupyter and Kaskada pre-installed and launch the Jupyter server. + + +```sh +docker run --rm -p 8888:8888 -v "$PWD:/home/jovyan/example" kaskadaio/jupyter +```` + +At the end of the log output you should see a URL like `http://127.0.0.1:8888/lab?token=d7f0cab9929e1b499b66fd3308357ed62dbb524db1ffe394`: + +``` +... 
+[I 2023-05-03 14:41:29.593 ServerApp] Jupyter Server 2.5.0 is running at: +[I 2023-05-03 14:41:29.593 ServerApp] http://756b93a11d10:8888/lab?token=d7f0cab9929e1b499b66fd3308357ed62dbb524db1ffe394 +[I 2023-05-03 14:41:29.593 ServerApp] http://127.0.0.1:8888/lab?token=d7f0cab9929e1b499b66fd3308357ed62dbb524db1ffe394 +[I 2023-05-03 14:41:29.593 ServerApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). +[C 2023-05-03 14:41:29.595 ServerApp] + + To access the server, open this file in a browser: + file:///home/jovyan/.local/share/jupyter/runtime/jpserver-7-open.html + Or copy and paste one of these URLs: + http://756b93a11d10:8888/lab?token=d7f0cab9929e1b499b66fd3308357ed62dbb524db1ffe394 + http://127.0.0.1:8888/lab?token=d7f0cab9929e1b499b66fd3308357ed62dbb524db1ffe394 +``` + +Copy the URL into your browser, and you should see the Jupyter UI. In the file browser on the left, open the `example` folder and double-click on `Notebook.ipynb`. + +Run the cells in the notebook to set up Kaskada. Cells that begin with `%%fenl` allow you to query the data: write a query starting on the line after `%%fenl` and run the cell. + + +## Preprocessing the data + +The included dataset is derived from the [NYC Taxi Trip Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) provided by the NYC Taxi and Limousine Commission (TLC). 
+The raw data has been cleaned using the following queries in DuckDB + +```sql +INSTALL spatial; +INSTALL parquet; +LOAD spatial; +LOAD parquet; + +CREATE TABLE rides AS SELECT * +FROM './.parquet'; + +CREATE TABLE zones AS SELECT zone, LocationId, borough, ST_GeomFromWKB(wkb_geometry) AS geom +FROM ST_Read('./spatial/test/data/nyc_taxi/taxi_zones/taxi_zones.shx'); + +copy ( + select + hvfhs_license_num, + dispatching_base_num, + originating_base_num, + request_datetime, + on_scene_datetime, + pickup_datetime, + dropoff_datetime, + PULocationID AS pu_location_id, + DOLocationID AS do_location_id, + trip_miles, + trip_time, + base_passenger_fare, + tolls, + bcf, + sales_tax, + congestion_surcharge, + airport_fee, + tips, + driver_pay, + shared_request_flag, + shared_match_flag, + access_a_ride_flag, + wav_request_flag, + wav_match_flag, + PUZone.zone AS pu_zone, + PUZone.borough AS pu_borough, + DOZone.zone AS do_zone, + DOZone.borough AS do_borough, + ST_Distance( ST_Centroid(PUZone.geom), ST_Centroid(DOZone.geom)) / 5280 AS distance_miles, + + from 'fhvhv_tripdata_2023-02.parquet' + join zones as PUZone on PULocationID = PUZone.LocationID + join zones as DOZone on DOLocationID = DOZone.LocationID +) TO 'fhvhv_combined.parquet' (FORMAT PARQUET); + +copy ( + select + hvfhs_license_num, + dispatching_base_num, + originating_base_num, + request_datetime, + on_scene_datetime, + pickup_datetime, + -- dropoff_datetime, The date and time of the trip drop-off + PULocationID AS pu_location_id, + DOLocationID AS do_location_id, + trip_miles, + -- trip_time, total time in seconds for passenger trip + -- base_passenger_fare, base passenger fare before tolls, tips, taxes, and fees + -- tolls, total amount of all tolls paid in trip + -- bcf, total amount collected in trip for Black Car Fund + -- sales_tax, total amount collected in trip for NYS sales tax + -- congestion_surcharge, total amount collected in trip for NYS congestion surcharge + -- airport_fee, $2.50 for both drop 
off and pick up at LaGuardia, Newark, and John F. Kennedy airports + -- tips, total amount of tips received from passenger + -- driver_pay, total driver pay (not including tolls or tips and net of commission, surcharges, or taxes) + shared_request_flag, + shared_match_flag, + access_a_ride_flag, + wav_request_flag, + wav_match_flag, + PUZone.zone AS pu_zone, + PUZone.borough AS pu_borough, + DOZone.zone AS do_zone, + DOZone.borough AS do_borough, + ST_Distance( ST_Centroid(PUZone.geom), ST_Centroid(DOZone.geom)) / 5280 AS distance_miles, + + from 'fhvhv_tripdata_2023-02.parquet' + join zones as PUZone on PULocationID = PUZone.LocationID + join zones as DOZone on DOLocationID = DOZone.LocationID +) TO 'fhvhv_pickups.parquet' (FORMAT PARQUET); +``` \ No newline at end of file diff --git a/examples/nyc-taxi/fhvhv_combined_100000.parquet b/examples/nyc-taxi/fhvhv_combined_100000.parquet new file mode 100644 index 000000000..cceb4b060 Binary files /dev/null and b/examples/nyc-taxi/fhvhv_combined_100000.parquet differ diff --git a/examples/nyc-taxi/fhvhv_pickups_100000.parquet b/examples/nyc-taxi/fhvhv_pickups_100000.parquet new file mode 100644 index 000000000..30f4981ba Binary files /dev/null and b/examples/nyc-taxi/fhvhv_pickups_100000.parquet differ