diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index 45cfc9cd7..38dfdf0de 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -26,19 +26,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Write your code below.\n",
-    "\n"
+    "\n",
+    "%load_ext dotenv\n",
+    "%dotenv "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\shail\\.conda\\envs\\dsi_participant\\lib\\site-packages\\dask\\dataframe\\_pyarrow_compat.py:15: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 11.0.0. Please consider upgrading.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
    "source": [
     "import dask.dataframe as dd"
    ]
@@ -55,14 +66,101 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 2897 total parquet files under ../../05_src/data/prices/\n",
+      "Using first 60 → sample_files list length: 60\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1995\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1995\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1996\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1996\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1997\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1997\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1998\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1998\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1999\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_1999\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2000\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2000\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2001\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2001\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2002\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2002\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2003\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2003\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2004\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2004\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2005\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2005\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2006\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2006\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2007\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2007\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2008\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2008\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2009\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2009\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2010\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2010\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2011\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2011\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2012\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2012\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2013\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2013\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2014\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2014\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2015\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2015\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2016\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2016\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2017\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2017\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2018\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2018\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2019\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2019\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2020\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACHV\ACHV_2020\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2001\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2001\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2002\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2002\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2003\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2003\part.1.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2004\part.0.parquet\n",
+      "  ../../05_src/data/prices\ACN\ACN_2004\part.1.parquet\n"
+     ]
+    }
+   ],
    "source": [
+    "\n",
     "import os\n",
     "from glob import glob\n",
     "\n",
-    "# Write your code below.\n",
+    "# Locate the PRICE_DATA directory\n",
+    "price_data_dir = os.getenv(\"PRICE_DATA\")\n",
+    "if not price_data_dir:\n",
+    "    raise RuntimeError(\"PRICE_DATA not set. Did you run %dotenv?\")\n",
+    "\n",
+    "# Glob for every .parquet file (recursively)\n",
+    "all_files = glob(os.path.join(price_data_dir, \"**\", \"*.parquet\"), recursive=True)\n",
+    "\n",
+    "# Take only the first 60 files\n",
+    "sample_files = all_files[:60]\n",
+    "\n",
+    "# Sanity check\n",
+    "print(f\"Found {len(all_files)} total parquet files under {price_data_dir}\")\n",
+    "print(f\"Using first 60 → sample_files list length: {len(sample_files)}\")\n",
+    "for path in sample_files:\n",
+    "    print(\" \", path)\n",
+    "\n",
+    "\n",
     "\n"
    ]
   },
@@ -88,11 +186,72 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\shail\\AppData\\Local\\Temp\\ipykernel_6524\\3770542123.py:12: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
+      "  Before: .apply(func)\n",
+      "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
+      "  or:     .apply(func, meta=('x', 'f8')) for series result\n",
+      "  Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Close\"].shift(1),\n",
+      "C:\\Users\\shail\\AppData\\Local\\Temp\\ipykernel_6524\\3770542123.py:13: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
+      "  Before: .apply(func)\n",
+      "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
+      "  or:     .apply(func, meta=('x', 'f8')) for series result\n",
+      "  Adj_Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Adj_Close\"].shift(1),\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "        Date     Open     High      Low    Close  Adj_Close  Volume    source  \\\n",
+      "0 1995-10-12  14850.0  14850.0  13365.0  13612.5    13612.5     100  ACHV.csv   \n",
+      "1 1995-10-13  14355.0  14355.0  13365.0  14107.5    14107.5       0  ACHV.csv   \n",
+      "2 1995-10-16  14355.0  14355.0  13365.0  14355.0    14355.0       0  ACHV.csv   \n",
+      "3 1995-10-17  13612.5  14355.0  13365.0  13860.0    13860.0       0  ACHV.csv   \n",
+      "4 1995-10-18  14107.5  14850.0  13365.0  14107.5    14107.5       0  ACHV.csv   \n",
+      "\n",
+      "  ticker  Year  Close_lag_1  Adj_Close_lag_1  returns  hi_lo_range  \n",
+      "0   ACHV  1995          NaN              NaN      NaN       1485.0  \n",
+      "1   ACHV  1995          NaN              NaN      NaN        990.0  \n",
+      "2   ACHV  1995          NaN              NaN      NaN        990.0  \n",
+      "3   ACHV  1995          NaN              NaN      NaN        990.0  \n",
+      "4   ACHV  1995          NaN              NaN      NaN       1485.0  \n"
+     ]
+    }
+   ],
    "source": [
-    "# Write your code below.\n",
+    "\n",
+    "\n",
+    "# 1. Read the first 60 files\n",
+    "ddf = dd.read_parquet(sample_files, engine=\"pyarrow\")\n",
+    "\n",
+    "# 2. Rename \"Adj Close\" to a Python-friendly name\n",
+    "ddf = ddf.rename(columns={\"Adj Close\": \"Adj_Close\"})\n",
+    "\n",
+    "# 3. Build the lag, returns, and hi_lo_range features\n",
+    "dd_feat = (\n",
+    "    ddf\n",
+    "    .assign(\n",
+    "        Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Close\"].shift(1),\n",
+    "        Adj_Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Adj_Close\"].shift(1),\n",
+    "    )\n",
+    "    .assign(\n",
+    "        returns = lambda df: (df[\"Close\"] / df[\"Close_lag_1\"]) - 1,\n",
+    "        hi_lo_range = lambda df: df[\"High\"] - df[\"Low\"],\n",
+    "    )\n",
+    "    .persist()\n",
+    ")\n",
+    "\n",
+    "# 4. Inspect\n",
+    "print(dd_feat.head())\n",
+    "\n",
+    "\n",
     "\n"
    ]
   },
@@ -108,11 +267,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   ticker       Date  returns  returns_ma_10\n",
+      "0    ACHV 1995-10-12      NaN            NaN\n",
+      "1    ACHV 1995-10-13      NaN            NaN\n",
+      "2    ACHV 1995-10-16      NaN            NaN\n",
+      "3    ACHV 1995-10-17      NaN            NaN\n",
+      "4    ACHV 1995-10-18      NaN            NaN\n",
+      "5    ACHV 1995-10-19      NaN            NaN\n",
+      "6    ACHV 1995-10-20      NaN            NaN\n",
+      "7    ACHV 1995-10-23      NaN            NaN\n",
+      "8    ACHV 1995-10-24      NaN            NaN\n",
+      "9    ACHV 1995-10-25      NaN            NaN\n",
+      "10   ACHV 1995-10-26      NaN            NaN\n",
+      "11   ACHV 1995-10-27      NaN            NaN\n",
+      "12   ACHV 1995-10-30      NaN            NaN\n",
+      "13   ACHV 1995-10-31      NaN            NaN\n",
+      "14   ACHV 1995-11-01      NaN            NaN\n"
+     ]
+    }
+   ],
    "source": [
-    "# Write your code below.\n",
+    "# 1. Compute the Dask DataFrame into a pandas DataFrame\n",
+    "import pandas as pd\n",
+    "\n",
+    "pd_df = dd_feat.compute()\n",
+    "\n",
+    "# 2. Add a 10-day moving average of returns per ticker\n",
+    "pd_df[\"returns_ma_10\"] = (\n",
+    "    pd_df\n",
+    "    .groupby(\"ticker\")[\"returns\"]\n",
+    "    .rolling(window=10, min_periods=1)\n",
+    "    .mean()\n",
+    "    .reset_index(level=0, drop=True)\n",
+    ")\n",
+    "\n",
+    "# 3. Check the result\n",
+    "print(pd_df[[\"ticker\", \"Date\", \"returns\", \"returns_ma_10\"]].head(15))\n",
+    "\n",
     "\n"
    ]
   },
@@ -123,7 +321,9 @@
    "Please comment:\n",
    "\n",
    "+ Was it necessary to convert to pandas to calculate the moving average return?\n",
+    "- Answer: Not necessary. Dask can express the same groupby and rolling logic (for example, via a groupby-apply), but converting to pandas simplifies the in-memory work for a small dataset like this one.\n",
    "+ Would it have been better to do it in Dask? Why?\n",
+    "- Answer: Yes. Keeping the rolling calculation in Dask scales beyond memory and preserves parallelism.\n",
    "\n",
    "(1 pt)"
   ]
  },
@@ -165,7 +365,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "env",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -179,7 +379,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.0"
+   "version": "3.9.15"
   }
  },
 "nbformat": 4,
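
Editor's note on the last answer: the 10-day moving average can also stay in Dask instead of materializing the whole frame. The snippet below is a minimal sketch, not part of the committed solution; it assumes the `dd_feat` frame built earlier in the notebook and that each ticker's rows are date-ordered. Passing an explicit `meta` also avoids the inference `UserWarning`s seen in the outputs above.

```python
# Minimal sketch, assuming `dd_feat` exists as built in the notebook above
# and that each ticker's rows are already sorted by Date.
returns_ma_10 = dd_feat.groupby("ticker")["returns"].apply(
    lambda s: s.rolling(window=10, min_periods=1).mean(),
    meta=("returns_ma_10", "f8"),  # explicit meta: no inference, no warning
)

# Computation stays lazy until the result is actually needed.
print(returns_ma_10.compute().head(15))
```

A groupby-apply shuffles each ticker's rows into a single group, so it costs more than a plain column operation, but it keeps the pipeline out-of-core and parallel, which is the point made in the answer above.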
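
Relatedly, the two `UserWarning`s in the feature-engineering cell come from the lagged `groupby(...).shift(1)` calls. Below is a hedged sketch of one way to quiet them, assuming the same renamed `ddf` as in the diff and that the installed Dask's groupby `shift` accepts a `meta` argument (recent releases do):

```python
# Sketch only: the same lag features as the notebook, with explicit meta.
dd_feat = ddf.assign(
    Close_lag_1=ddf.groupby("ticker")["Close"].shift(1, meta=("Close", "f8")),
    Adj_Close_lag_1=ddf.groupby("ticker")["Adj_Close"].shift(1, meta=("Adj_Close", "f8")),
).assign(
    returns=lambda df: (df["Close"] / df["Close_lag_1"]) - 1,
    hi_lo_range=lambda df: df["High"] - df["Low"],
)
```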