class ButtonDat:
    """Load the game's local CSV snapshots into pandas DataFrames.

    The four files ``edges.csv``, ``nodes.csv``, ``text.csv`` and
    ``employee.csv`` are read from ``data_dir`` with ``pandas.read_csv``.

    Attributes:
        edges_df (pd.DataFrame): Contents of ``edges.csv``.
        nodes_df (pd.DataFrame): Contents of ``nodes.csv``.
        text_df (pd.DataFrame): Contents of ``text.csv``.
        employee_df (pd.DataFrame): Contents of ``employee.csv``.
    """

    def __init__(self, data_dir='button_2/data'):
        """Read all four CSV files from ``data_dir``.

        Args:
            data_dir (str | os.PathLike): Directory that holds the CSV files.
                The default, ``'button_2/data'``, is a relative path and is
                therefore resolved against the current working directory,
                exactly like the previously hard-coded file names.

        Raises:
            FileNotFoundError: If one of the CSV files does not exist.
        """
        # Local import so the class only relies on names it brings in itself.
        from pathlib import Path

        data_dir = Path(data_dir)
        self.edges_df = pd.read_csv(data_dir / 'edges.csv')
        self.nodes_df = pd.read_csv(data_dir / 'nodes.csv')
        self.text_df = pd.read_csv(data_dir / 'text.csv')
        self.employee_df = pd.read_csv(data_dir / 'employee.csv')
class DataUpdater:
    """Scrape every game sheet and persist each one as a local CSV file.

    Constructing the object builds one ``GsScraper`` per sheet name and
    stores it on the attribute of the same name (``edges``, ``nodes``,
    ``text``, ``employee``). ``update_all`` then writes each scraper's
    ``df`` to ``button_2/data/<sheet>.csv``.
    """

    # Sheets handled by the updater, in the order they are scraped and written.
    _SHEETS = ('edges', 'nodes', 'text', 'employee')

    def __init__(self):
        """Create one scraper per sheet; each is stored as ``self.<sheet>``."""
        for sheet in self._SHEETS:
            setattr(self, sheet, GsScraper(sheet))

    def update_all(self):
        """Write every scraped DataFrame to its CSV file, omitting the index.

        The target paths are relative, so they are resolved against the
        current working directory.
        """
        for sheet in self._SHEETS:
            frame = getattr(self, sheet).df
            frame.to_csv(f'button_2/data/{sheet}.csv', index=False)
class GsScraper:
    """
    Google Sheets data connector that loads one sheet tab into a DataFrame.

    The tab is fetched through the Google Sheets CSV export URL
    (``https://docs.google.com/spreadsheets/d/<id>/export?format=csv&gid=<gid>``)
    and parsed with ``pandas.read_csv``. HTTP failures that are likely to be
    transient are retried with exponential backoff.

    Attributes:
        df (pd.DataFrame): Loaded data from the specified sheet
        sheet_name (str): Name of the sheet tab being accessed
        data_url (str): CSV export URL built from the environment variables

    Supported Sheet Names:
        - 'edges': State transition definitions
        - 'nodes': Node content and configuration
        - 'text': Reusable text snippets
        - 'employee': Job titles and departments

    Environment Variables Required:
        BUTTON_SHEET_ID: Main Google Sheets document ID
        BUTTON_SHEET_*_GID: Individual tab GID for each sheet type
    """

    # Sheet name -> environment variable that holds the GID of that tab.
    _GID_ENV_VARS = {
        'edges': "BUTTON_SHEET_EDGES_GID",
        'nodes': "BUTTON_SHEET_NODES_GID",
        'text': "BUTTON_SHEET_TEXT_GID",
        'employee': "BUTTON_SHEET_EMPLOYEE_GID",
    }

    # HTTP status codes worth retrying: 404 is kept from the original
    # behaviour, 429 is the standard "Too Many Requests" rate-limit status,
    # and the 5xx codes cover temporary service unavailability.
    _RETRYABLE_STATUS = frozenset({404, 429, 500, 502, 503, 504})

    def __init__(self, sheet_name):
        """
        Build the export URL for ``sheet_name`` and load its data.

        Args:
            sheet_name (str): Name of sheet tab to load
                Must be one of: 'edges', 'nodes', 'text', 'employee'

        Raises:
            ValueError: If sheet_name is not recognized
            EnvironmentError: If BUTTON_SHEET_ID or the GID variable of the
                requested sheet is unset or empty
            urllib.error.HTTPError: If the download still fails after all
                retries, or fails with a status that is not retried
        """
        # Validate the name first so a typo is reported even when the
        # environment is not configured.
        if not isinstance(sheet_name, str) or sheet_name not in self._GID_ENV_VARS:
            raise ValueError(f"Unknown sheet name: {sheet_name}. Must be one of: edges, nodes, text, employee")
        self.sheet_name = sheet_name

        # Load sheet configuration from environment variables
        gid_var = self._GID_ENV_VARS[sheet_name]
        self._sheet_id = os.getenv("BUTTON_SHEET_ID")
        self._sheet_gid = os.getenv(gid_var)

        # Fail early: without this check the URL would silently contain the
        # text "None" and the problem would only surface as an HTTP error.
        missing = [
            var
            for var, value in (("BUTTON_SHEET_ID", self._sheet_id), (gid_var, self._sheet_gid))
            if not value
        ]
        if missing:
            raise EnvironmentError("Missing required environment variable(s): " + ", ".join(missing))

        self.data_url = f"https://docs.google.com/spreadsheets/d/{self._sheet_id}/export?format=csv&gid={self._sheet_gid}"

        # Load data with retry logic for rate limiting
        self.df = self._load_data_with_retry()

    def _load_data_with_retry(self, max_retries=3, delay=1):
        """
        Read ``self.data_url`` as CSV, retrying transient HTTP failures.

        Args:
            max_retries (int): Total number of attempts; must be at least 1
            delay (int | float): Seconds to wait before the second attempt;
                doubled after every failed attempt (exponential backoff)

        Returns:
            pd.DataFrame: The parsed CSV data

        Raises:
            ValueError: If max_retries is smaller than 1
            urllib.error.HTTPError: If the status is not retryable or the
                last attempt fails
        """
        if max_retries < 1:
            # Otherwise the loop body never runs and None would be returned.
            raise ValueError("max_retries must be at least 1")

        for attempt in range(1, max_retries + 1):
            try:
                return pd.read_csv(self.data_url)
            except HTTPError as e:
                if e.code not in self._RETRYABLE_STATUS or attempt == max_retries:
                    raise  # bare raise keeps the original traceback
                print(f"⚠️ HTTP {e.code}, retrying in {delay} seconds... (attempt {attempt})")
                time.sleep(delay)
                delay *= 2  # Exponential backoff
Leadership promise there will be cake after the analysis presentation.",True +source_data,transform_data,You find clearly-marked tables with interpretable column names.,True +transform_data,analyse_data,You architect a clean data lineage that pulls the sources together into datasets ready for analysis.,True +analyse_data,report_analytics,You build dashboards that answer the question the stakeholder framed.,True +report_analytics,decision_maker,You present your findings with flair in the boardroom to the stakeholders.,True +report_analytics,source_data,"During your presentation one of the stakeholders points out there's no integration of a data source that was never mentioned until now. You will need to go back to scouring through the names of databases in the data warehouse to find the needle in the haystack desired column required, it likely doesn't have a very informative column name.",False +decision_maker,initiate_project,"When the analysis reaches leadership, they state it doesn't answer the question posed. An abortive blame game erupts over whether you misunderstood the question, your manager miscommunicated it, or whether leadership failed to make their actual needs known. The upshot is, you begin the analysis again.",False +analyse_data,transform_data,"Your analysis shows duplicates and missingness, empty bar plots, weird-looking scatter plots. Time to dive back into the data lineage spaghetti and figure out what went wrong.",False +decision_maker,end,Leadership declare the results satisfactory but there is a sense that this only sort-of answered the questions they had. There is no cake. 
You switch your professional profile to available for hire.,True +end,summary,,True diff --git a/button_2/data/employee.csv b/button_2/data/employee.csv new file mode 100644 index 0000000..d9f3934 --- /dev/null +++ b/button_2/data/employee.csv @@ -0,0 +1,6 @@ +job_title,department +data scientist,data +data analyst,data analytics +data engineer,business intelligence +analytics engineer,data & analytics +business intelligence analyst,data science diff --git a/button_2/data/nodes.csv b/button_2/data/nodes.csv new file mode 100644 index 0000000..7cba040 --- /dev/null +++ b/button_2/data/nodes.csv @@ -0,0 +1,12 @@ +node,edge_selector,title_text,intro_text,event_text,pbn +start_game,start,PRESS A BUTTON NOW,Explore the lived experience of working in data.,,to enter the world of data +welcome,auto,Welcome!,Welcome employee #eiLaSi-LV-426-ekaCehT to your first day at TechCo! You are a data scientist. It is your job to press buttons. ,,to proceed to onboarding +onboarding,auto,Onboarding,Welcome employee #eiLaSi-LV-426-ekaCehT to onboarding at TechCo! You are a data scientist. It is your job to press buttons. ,,to complete onboarding +initiate_project,auto,A question that needs answering,"An executive at TechCo has a question that needs answering. You are told the data can be found in the data warehouse, no problem. Once you are done with the analysis, there will be cake.",,to consult stakeholders about their evidence-based needs +source_data,auto,Source the data,"The data warehouse is home to hundreds of databases. Each database is home to dozens of tables, each of which has hundreds of columns. There are billions of rows of data.",,to dive into the warehouse and live in matrix-scroll land of scrutinising seemingly endless columns and rows +transform_data,auto,Shape the data,"Now that you have figured out which data sources you need to extract from, you are ready to transform the data into analytic tables that will form the foundation of your shiny analyses. 
You are sure looking forward to that cake.",,to architect a data lineage +analyse_data,random,Analyse the data,"Finally, the fun part! Making pretty pictures out of data to answer questions in meaningful ways. Spurred on by the promise of cake, you lose yourself in the finesse of choosing colour palettes and developing visualisations that answer the multi-faceted question that was posed. You work late into the night.",,to build interactive data visualisations +report_analytics,random,Report your findings,"The big day has arrived! The boardroom is filled with middle-managers somewhat awfully assembled around croissants and tepid coffee. No cake yet, but getting ever closer.",,to report your findings +decision_maker,random,Decision maker assessment,You receive a calendar invite from your manager saying they have met with leadership and want to discuss how the analysis was received. There is no mention of cake.,,to meet with your manager about the outcome of the analysis and whether you will get cake +end,end,End game,"The project ends in a whimper, just a meeting with your manager. There was never going to be any cake. You resignedly flicked your professional profile to ""available for work"".",,to reflect on your time at TechCo +summary,end,Your TechCo experience,,, diff --git a/button_2/data/text.csv b/button_2/data/text.csv new file mode 100644 index 0000000..2ae518c --- /dev/null +++ b/button_2/data/text.csv @@ -0,0 +1,5 @@ +id_text,text_type,text_context,text +pbn,generic,input,Press a button now +edge_good,edge,edge_selector,This step went as expected. +edge_bad,edge,edge_selector,"Oh, no! Something isn't quite right." 
def test_button_dat_initialisation():
    """Check that ButtonDat exposes all four DataFrames and none is empty."""
    game_data = ButtonDat()

    # Attribute name -> label used in the "not empty" assertion message.
    frames = {
        'edges_df': 'Edges',
        'nodes_df': 'Nodes',
        'text_df': 'Text',
        'employee_df': 'Employee',
    }

    # Every attribute must exist before any of them is inspected.
    for attr in frames:
        assert hasattr(game_data, attr), f"ButtonDat should have {attr} attribute"

    # Each DataFrame must contain at least one row.
    for attr, label in frames.items():
        assert not getattr(game_data, attr).empty, f"{label} DataFrame should not be empty"


def test_gs_scraper_edges_non_empty():
    """The 'edges' sheet must load into a DataFrame that is not empty."""
    frame = GsScraper('edges').df
    hint = "sheet name likely misspecified or env vars missing"
    assert frame is not None, f"GsScraper.df should not be None after initialization, {hint}"
    assert not frame.empty, f"GsScraper.df should not be empty after initialization, {hint}"
def test_gs_scraper_nodes_non_empty():
    """The 'nodes' sheet must load into a DataFrame that is not empty."""
    frame = GsScraper('nodes').df
    hint = "sheet name likely misspecified or env vars missing"
    assert frame is not None, f"GsScraper.df should not be None after initialization, {hint}"
    assert not frame.empty, f"GsScraper.df should not be empty after initialization, {hint}"