datacommonsorg · abhishekjaisw · Jan 27, 2026 · Jan 27, 2026 · Jan 28, 2026 · gemini-code-assist
diff --git a/statvar_imports/statistics_poland/README.md b/statvar_imports/statistics_poland/README.md
@@ -0,0 +1,43 @@
+# Poland Demographics Dataset
+## Overview
+This dataset contains foundational demographic and socio-economic statistics for Poland, sourced directly from the datasets published by Poland's official statistical authority.
+
+## Data Source
+
+**Source URL:** 
+https://stat.gov.pl/en/databases/
+
+
+The data comes from Poland's official statistical authority and includes comprehensive demographic variables such as population counts, age distributions, and other census-related metrics.
+Processing Instructions
+
+## How to download data
+Download script (download_script.py). To prepare the data, use the provided download script, download_script.py. The script requires the raw workbook poland_data_sample/poland_raw.xlsx to be present; it reads that file's "DANE" sheet and automatically creates a "poland_input" folder containing the processed file, StatisticsPoland_input.csv.
+
+type of place: State.
+
+statvars: Demographics
+
+years: 2003 to 2024.
+
+## Processing Instructions
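+To process the Poland Census data and generate statistical variables, run the following commands from the "data" directory. First run the download script to generate the input file, then run the statvar processor:
+
+Example Download: python3 statvar_imports/statistics_poland/download_script.py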
+
+## For Test Data Run:
+python3 tools/statvar_importer/stat_var_processor.py \
+  --input_data=statvar_imports/statistics_poland/test/StatisticsPoland_input.csv \
+  --pv_map=statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv \
+  --output_path=statvar_imports/statistics_poland/test/StatisticsPoland_output \
+  --config_file=statvar_imports/statistics_poland/Statistics_Poland_metadata.csv \
+  --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf \
+  2>&1 | tee statvar_imports/statistics_poland/log.txt
+
+## For Main data run
+python3 tools/statvar_importer/stat_var_processor.py \
+  --input_data=statvar_imports/statistics_poland/poland_input/StatisticsPoland_input.csv \
+  --pv_map=statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv \
+  --output_path=statvar_imports/statistics_poland/poland_output/StatisticsPoland_output \
+  --config_file=statvar_imports/statistics_poland/Statistics_Poland_metadata.csv \
+  --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
diff --git a/statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv b/statvar_imports/statistics_poland/StatisticsPoland_pvmap.csv
@@ -0,0 +1,68 @@
+key,p1,v1,p2,v2,p3,v3,p4,v4,p5,v5
+,,,,,,,,,,
+Code,measuredProperty,count,populationType,Person,,,,,,
+males,gender,Male,,,,,,,,
+females,gender,Female,,,,,,,,
+total,,,,,,,,,,
+in urban areas,placeOfResidenceClassification,Urban,,,,,,,,
+in rural areas,placeOfResidenceClassification,Rural,,,,,,,,
+
+0-2,age,Years0To2,,,,,,,,
+3-6,age,Years3To6,,,,,,,,
+7-12,age,Years7To12,,,,,,,,
+13-15,age,Years13To15,,,,,,,,
+16-19,age,Years16To19,,,,,,,,
+20-24,age,Years20To24,,,,,,,,
+25-34,age,Years25To34,,,,,,,,
+35-44,age,Years35To44,,,,,,,,
+45-54,age,Years45To54,,,,,,,,
+55-64,age,Years55To64,,,,,,,,
+65 and more,age,Years65Onwards,,,,,,,,
+,,,,,,,,,,
+,,,,,,,,,,
+POLAND,observationAbout,country/POL,#Header,observationAbout,,,,,,
+DOLNOŚLĄSKIE,observationAbout,wikidataId/Q54150,#Header,observationAbout,,,,,,
+KUJAWSKO-POMORSKIE,observationAbout,nuts/PL61,#Header,observationAbout,,,,,,
+LUBELSKIE,observationAbout,wikidataId/Q54155,#Header,observationAbout,,,,,,
+LUBUSKIE,observationAbout,wikidataId/Q54157,#Header,observationAbout,,,,,,
+ŁÓDZKIE,observationAbout,nuts/PL71,#Header,observationAbout,,,,,,
+MAŁOPOLSKIE,observationAbout,nuts/PL213,#Header,observationAbout,,,,,,
+MAZOWIECKIE,observationAbout,wikidataId/Q54169,#Header,observationAbout,,,,,,
+OPOLSKIE,observationAbout,wikidataId/Q54171,#Header,observationAbout,,,,,,
+PODKARPACKIE,observationAbout,wikidataId/Q54175,#Header,observationAbout,,,,,,
+PODLASKIE,observationAbout,wikidataId/Q54177,#Header,observationAbout,,,,,,
+POMORSKIE,observationAbout,wikidataId/Q1288480,#Header,observationAbout,,,,,,
+ŚLĄSKIE,observationAbout,wikidataId/Q588,#Header,observationAbout,,,,,,
+ŚWIĘTOKRZYSKIE,observationAbout,nuts/PL72,#Header,observationAbout,,,,,,
+WARMIŃSKO-MAZURSKIE,observationAbout,wikidataId/Q54184,#Header,observationAbout,,,,,,
+WIELKOPOLSKIE,observationAbout,wikidataId/Q54187,#Header,observationAbout,,,,,,
+ZACHODNIOPOMORSKIE,observationAbout,wikidataId/Q54188,#Header,observationAbout,,,,,,
+,,,,,,,,,,
+2003,observationDate,2003,value,{Number},,,,,,
+2004,observationDate,2004,value,{Number},,,,,,
+2005,observationDate,2005,value,{Number},,,,,,
+2006,observationDate,2006,value,{Number},,,,,,
+2007,observationDate,2007,value,{Number},,,,,,
+2008,observationDate,2008,value,{Number},,,,,,
+2009,observationDate,2009,value,{Number},,,,,,
+2010,observationDate,2010,value,{Number},,,,,,
+2011,observationDate,2011,value,{Number},,,,,,
+2012,observationDate,2012,value,{Number},,,,,,
+2013,observationDate,2013,value,{Number},,,,,,
+2014,observationDate,2014,value,{Number},,,,,,
+2015,observationDate,2015,value,{Number},,,,,,
+2016,observationDate,2016,value,{Number},,,,,,
+2017,observationDate,2017,value,{Number},,,,,,
+2018,observationDate,2018,value,{Number},,,,,,
+2019,observationDate,2019,value,{Number},,,,,,
+2020,observationDate,2020,value,{Number},,,,,,
+2021,observationDate,2021,value,{Number},,,,,,
+2022,observationDate,2022,value,{Number},,,,,,
+2023,observationDate,2023,value,{Number},,,,,,
+2024,observationDate,2024,value,{Number},,,,,,
+2025,observationDate,2025,value,{Number},,,,,,
+2026,observationDate,2026,value,{Number},,,,,,
+2027,observationDate,2027,value,{Number},,,,,,
+2028,observationDate,2028,value,{Number},,,,,,
+2029,observationDate,2029,value,{Number},,,,,,
+2030,observationDate,2030,value,{Number},,,,,,
diff --git a/statvar_imports/statistics_poland/Statistics_Poland_metadata.csv b/statvar_imports/statistics_poland/Statistics_Poland_metadata.csv
@@ -0,0 +1,13 @@
+config,value
+provenance_url,https://bdl.stat.gov.pl/bdl/dane/podgrup/tablica
+output_columns,"observationDate,observationAbout,value,variableMeasured"
+places_within,country/POL
+#place_types,"AdministrativeArea,AdministrativeArea1,AdministrativeArea2,State"
+#debug,1
+#input_rows,100
+#word_delimiter,''
+#skip_rows,1
+header_rows,5
+mapped_columns,2
+dc_api_root,https://api.datacommons.org
+
diff --git a/statvar_imports/statistics_poland/download_script.py b/statvar_imports/statistics_poland/download_script.py
@@ -0,0 +1,81 @@
+import pandas as pd
+import os
+from datetime import datetime
+
+# Configuration
+INPUT_FILE = "statvar_imports/statistics_poland/poland_data_sample/poland_raw.xlsx"
+# Final path for Data Commons import
+OUTPUT_DIR = "statvar_imports/statistics_poland/poland_input"
+OUTPUT_FILE = os.path.join(OUTPUT_DIR, "StatisticsPoland_input.csv")
+
+# Target functional age groups
+TARGET_AGES = [
+    "0-2", "3-6", "7-12", "13-15", "16-19", "20-24", 
+    "25-34", "35-44", "45-54", "55-64", "65 i więcej"
+]
+
+def process_poland_pivot():
+    if not os.path.exists(INPUT_FILE):
+        print(f"ERROR: {INPUT_FILE} not found.")
+        return
+
+    print(f"Starting generic processing. Saving to: {OUTPUT_FILE}")
+
+    try:
+        # 1. Load the 'DANE' sheet
+        df = pd.read_excel(INPUT_FILE, sheet_name='DANE')
+        df.columns = ['Code', 'Name', 'Age', 'Sex', 'Location', 'Year', 'Value', 'Unit', 'Attr']
+
+        # 2. Generic Filtering
+        # Keep only specified age groups
+        df = df[df['Age'].isin(TARGET_AGES)]
+
+        # DYNAMIC YEAR LOGIC:
+        # Detects all years in the file and filters out any accidental future projections
+        current_year = datetime.now().year
+        available_years = sorted([y for y in df['Year'].unique() if y <= current_year])
+        df = df[df['Year'].isin(available_years)]
+
+        # 3. Translation Dictionary
+        translations = {
+            'mężczyźni': 'males',
+            'kobiety': 'females',
+            'ogółem': 'total',
+            'w miastach': 'in urban areas',
+            'na wsi': 'in rural areas',
+            'POLSKA': 'POLAND',
+            '65 i więcej': '65 and more'
+        }
+
+        df['Sex'] = df['Sex'].replace(translations)
+        df['Location'] = df['Location'].replace(translations)
+        df['Name'] = df['Name'].replace(translations)
+        df['Age'] = df['Age'].replace(translations)
-        df['Sex'] = df['Sex'].replace(translations)
-        df['Location'] = df['Location'].replace(translations)
-        df['Name'] = df['Name'].replace(translations)
-        df['Age'] = df['Age'].replace(translations)
+        for col in ['Sex', 'Location', 'Name', 'Age']:
+            df[col] = df[col].replace(translations)
-        df['Sex'] = df['Sex'].replace(translations)
-        df['Location'] = df['Location'].replace(translations)
-        df['Name'] = df['Name'].replace(translations)
-        df['Age'] = df['Age'].replace(translations)
+        for col in ['Sex', 'Location', 'Name', 'Age']:
+            df[col] = df[col].replace(translations)
+
+        # 4. Create the Pivot Table
+        # Stacks categories into a multi-level header: Age > Sex > Location > Year
+        pivot_df = df.pivot_table(
+            index=['Code', 'Name'], 
+            columns=['Age', 'Sex', 'Location', 'Year'], 
+            values='Value'
+        )
+
+        # 5. Format Geographic Codes (ensuring 7-digit padding)
+        pivot_df.index = pivot_df.index.set_levels(
+            pivot_df.index.levels[0].astype(str).str.zfill(7), level=0
+        )
+
+        # 6. Save result
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        # utf-8-sig ensures Polish special characters in 'Name' stay readable
+        pivot_df.to_csv(OUTPUT_FILE, encoding='utf-8-sig')
+
+        print(f"SUCCESS: {OUTPUT_FILE} has been updated.")
+        print(f"Years Included: {available_years}")
+        print(f"Total Geographies Processed: {pivot_df.shape[0]}")
+
+    except Exception as e:
+        print(f"Processing Error: {e}")
+
+if __name__ == "__main__":
+    process_poland_pivot()
diff --git a/statvar_imports/statistics_poland/manifest.json b/statvar_imports/statistics_poland/manifest.json
@@ -0,0 +1,26 @@
+{
+    "import_specifications": [
+        {
+            "import_name": "statistics_poland",
+            "curator_emails": [
+                "support@datacommons.org"
+            ],
+            "provenance_url": "https://stat.gov.pl/en/databases/",
+            "provenance_description": "Population data for demographic variables such as population counts, age distributions, and other census-related metrics in Poland",
+            "scripts": [
+                "download_script.py",
+                "../../tools/statvar_importer/stat_var_processor.py --input_data=poland_input/StatisticsPoland_input.csv --pv_map=StatisticsPoland_pvmap.csv --config_file=Statistics_Poland_metadata.csv --output_path=poland_output/StatisticsPoland_output"
+            ],
+            "source_files": [
+                "poland_input/StatisticsPoland_input.csv"
+            ],
+            "import_inputs": [
+                {
+                    "template_mcf": "poland_output/StatisticsPoland_output.tmcf",
+                    "cleaned_csv": "poland_output/StatisticsPoland_output.csv"
+                }
+            ],
+            "cron_schedule": "0 0 1 1,4,7,10 *"
+        }
+    ]
+}