Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 214 additions & 14 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,30 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Write your code below.\n",
"\n"
"\n",
"%load_ext dotenv\n",
"%dotenv "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\shail\\.conda\\envs\\dsi_participant\\lib\\site-packages\\dask\\dataframe\\_pyarrow_compat.py:15: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 11.0.0. Please consider upgrading.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import dask.dataframe as dd"
]
Expand All @@ -55,14 +66,101 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 2897 total parquet files under ../../05_src/data/prices/\n",
"Using first 60 → sample_files list length: 60\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1995\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1995\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1996\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1996\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1997\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1997\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1998\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1998\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1999\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_1999\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2000\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2000\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2001\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2001\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2002\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2002\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2003\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2003\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2004\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2004\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2005\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2005\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2006\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2006\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2007\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2007\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2008\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2008\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2009\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2009\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2010\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2010\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2011\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2011\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2012\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2012\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2013\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2013\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2014\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2014\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2015\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2015\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2016\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2016\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2017\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2017\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2018\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2018\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2019\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2019\\part.1.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2020\\part.0.parquet\n",
" ../../05_src/data/prices\\ACHV\\ACHV_2020\\part.1.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2001\\part.0.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2001\\part.1.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2002\\part.0.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2002\\part.1.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2003\\part.0.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2003\\part.1.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2004\\part.0.parquet\n",
" ../../05_src/data/prices\\ACN\\ACN_2004\\part.1.parquet\n"
]
}
],
"source": [
"\n",
"import os\n",
"from glob import glob\n",
"\n",
"# Write your code below.\n",
"# PRICE_DATA directory \n",
"price_data_dir = os.getenv(\"PRICE_DATA\")\n",
"if not price_data_dir:\n",
" raise RuntimeError(\"PRICE_DATA not set. Did you run %dotenv?\")\n",
"\n",
"#Glob for every .parquet (recursively)\n",
"all_files = glob(os.path.join(price_data_dir, \"**\", \"*.parquet\"), recursive=True)\n",
"\n",
"#Take only the first 60\n",
"sample_files = all_files[:60]\n",
"\n",
"#check\n",
"print(f\"Found {len(all_files)} total parquet files under {price_data_dir}\")\n",
"print(f\"Using first 60 → sample_files list length: {len(sample_files)}\")\n",
"for path in sample_files:\n",
" print(\" \", path)\n",
"\n",
"\n",
"\n"
]
},
Expand All @@ -88,11 +186,72 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\shail\\AppData\\Local\\Temp\\ipykernel_6524\\3770542123.py:12: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
" Before: .apply(func)\n",
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
" or: .apply(func, meta=('x', 'f8')) for series result\n",
" Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Close\"].shift(1),\n",
"C:\\Users\\shail\\AppData\\Local\\Temp\\ipykernel_6524\\3770542123.py:13: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
" Before: .apply(func)\n",
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
" or: .apply(func, meta=('x', 'f8')) for series result\n",
" Adj_Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Adj_Close\"].shift(1),\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" Date Open High Low Close Adj_Close Volume source \\\n",
"0 1995-10-12 14850.0 14850.0 13365.0 13612.5 13612.5 100 ACHV.csv \n",
"1 1995-10-13 14355.0 14355.0 13365.0 14107.5 14107.5 0 ACHV.csv \n",
"2 1995-10-16 14355.0 14355.0 13365.0 14355.0 14355.0 0 ACHV.csv \n",
"3 1995-10-17 13612.5 14355.0 13365.0 13860.0 13860.0 0 ACHV.csv \n",
"4 1995-10-18 14107.5 14850.0 13365.0 14107.5 14107.5 0 ACHV.csv \n",
"\n",
" ticker Year Close_lag_1 Adj_Close_lag_1 returns hi_lo_range \n",
"0 ACHV 1995 NaN NaN NaN 1485.0 \n",
"1 ACHV 1995 NaN NaN NaN 990.0 \n",
"2 ACHV 1995 NaN NaN NaN 990.0 \n",
"3 ACHV 1995 NaN NaN NaN 990.0 \n",
"4 ACHV 1995 NaN NaN NaN 1485.0 \n"
]
}
],
"source": [
"# Write your code below.\n",
"\n",
"\n",
"# 1. Read first 60 files\n",
"ddf = dd.read_parquet(sample_files, engine=\"pyarrow\")\n",
"\n",
"# 2. Rename “Adj Close” to a Python‐friendly name\n",
"ddf = ddf.rename(columns={\"Adj Close\": \"Adj_Close\"})\n",
"\n",
"# 3. Build your lag, returns and hi_lo_range features\n",
"dd_feat = (\n",
" ddf\n",
" .assign(\n",
" Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Close\"].shift(1),\n",
" Adj_Close_lag_1 = lambda df: df.groupby(\"ticker\")[\"Adj_Close\"].shift(1),\n",
" )\n",
" .assign(\n",
" returns = lambda df: (df[\"Close\"] / df[\"Close_lag_1\"]) - 1,\n",
" hi_lo_range = lambda df: df[\"High\"] - df[\"Low\"],\n",
" )\n",
" .persist()\n",
")\n",
"\n",
"# 4. Inspect\n",
"print(dd_feat.head())\n",
"\n",
"\n",
"\n"
]
},
Expand All @@ -108,11 +267,50 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ticker Date returns returns_ma_10\n",
"0 ACHV 1995-10-12 NaN NaN\n",
"1 ACHV 1995-10-13 NaN NaN\n",
"2 ACHV 1995-10-16 NaN NaN\n",
"3 ACHV 1995-10-17 NaN NaN\n",
"4 ACHV 1995-10-18 NaN NaN\n",
"5 ACHV 1995-10-19 NaN NaN\n",
"6 ACHV 1995-10-20 NaN NaN\n",
"7 ACHV 1995-10-23 NaN NaN\n",
"8 ACHV 1995-10-24 NaN NaN\n",
"9 ACHV 1995-10-25 NaN NaN\n",
"10 ACHV 1995-10-26 NaN NaN\n",
"11 ACHV 1995-10-27 NaN NaN\n",
"12 ACHV 1995-10-30 NaN NaN\n",
"13 ACHV 1995-10-31 NaN NaN\n",
"14 ACHV 1995-11-01 NaN NaN\n"
]
}
],
"source": [
"# Write your code below.\n",
"# 1. Compute the Dask DataFrame into pandas\n",
"import pandas as pd\n",
"\n",
"pd_df = dd_feat.compute()\n",
"\n",
"# 2. Add a 10-day moving average \n",
"pd_df[\"returns_ma_10\"] = (\n",
" pd_df\n",
" .groupby(\"ticker\")[\"returns\"] \n",
" .rolling(window=10, min_periods=1) \n",
" .mean() \n",
" .reset_index(level=0, drop=True) \n",
")\n",
"\n",
"# 3. check\n",
"print(pd_df[[\"ticker\", \"Date\", \"returns\", \"returns_ma_10\"]].head(15))\n",
"\n",
"\n"
]
},
Expand All @@ -123,7 +321,9 @@
"Please comment:\n",
"\n",
"+ Was it necessary to convert to pandas to calculate the moving average return?\n",
"- Answer: Not nessary,Dask can supports groupby and rolling, however converting to pandas simplies for working in-memory for small dataseets.\n",
"+ Would it have been better to do it in Dask? Why?\n",
"- Answer Yes,rolling calculation helps to scale beyound memory and maintain parallelism.\n",
"\n",
"(1 pt)"
]
Expand Down Expand Up @@ -165,7 +365,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "env",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -179,7 +379,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.9.15"
}
},
"nbformat": 4,
Expand Down