From db71c0cdc2f3648faa062e2a1f17fa9c20446d33 Mon Sep 17 00:00:00 2001
From: Patricia Ternes
Date: Mon, 11 Apr 2022 14:45:15 +0100
Subject: [PATCH 1/2] add code for activity model
---
.gitignore | 1 +
01-ActivityModel/activity-model/README.md | 168 +++++++++
.../activity-model/activity_model/__init__.py | 1 +
.../activity-model/activity_model/__main__.py | 28 ++
.../activity_model/data_preparation.py | 350 ++++++++++++++++++
.../activity_model/enriching_population.py | 324 ++++++++++++++++
.../activity-model/config/epc_api.yaml | 25 ++
.../activity-model/config/lad_codes.yaml | 8 +
.../activity-model/config/lookups.yaml | 235 ++++++++++++
.../activity-model/config/psm.yaml | 17 +
.../activity-model/config/spenser.yaml | 6 +
.../activity-model/environment.yml | 21 ++
01-ActivityModel/activity-model/setup.py | 32 ++
.../activity-model/tests/__init__.py | 0
.../tests/test_activity_model.py | 48 +++
15 files changed, 1264 insertions(+)
create mode 100644 01-ActivityModel/activity-model/README.md
create mode 100644 01-ActivityModel/activity-model/activity_model/__init__.py
create mode 100644 01-ActivityModel/activity-model/activity_model/__main__.py
create mode 100644 01-ActivityModel/activity-model/activity_model/data_preparation.py
create mode 100644 01-ActivityModel/activity-model/activity_model/enriching_population.py
create mode 100644 01-ActivityModel/activity-model/config/epc_api.yaml
create mode 100644 01-ActivityModel/activity-model/config/lad_codes.yaml
create mode 100644 01-ActivityModel/activity-model/config/lookups.yaml
create mode 100644 01-ActivityModel/activity-model/config/psm.yaml
create mode 100644 01-ActivityModel/activity-model/config/spenser.yaml
create mode 100644 01-ActivityModel/activity-model/environment.yml
create mode 100644 01-ActivityModel/activity-model/setup.py
create mode 100644 01-ActivityModel/activity-model/tests/__init__.py
create mode 100644 01-ActivityModel/activity-model/tests/test_activity_model.py
diff --git a/.gitignore b/.gitignore
index 30f27b9..38965d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
.vscode/*
*.ipynb_checkpoints
+*.pytest_cache*
01-ActivityModel/data/*
*all-domestic-certificates.zip
*epc_england.zip
diff --git a/01-ActivityModel/activity-model/README.md b/01-ActivityModel/activity-model/README.md
new file mode 100644
index 0000000..5675d08
--- /dev/null
+++ b/01-ActivityModel/activity-model/README.md
@@ -0,0 +1,168 @@
+# Activity Model
+
+This is the code repository for the Activity Model package.
+
+The Activity Model returns a synthetic population to represent the England
+household population. This model, however, does not create the population from
+scratch, it uses a well-known synthetic population, the SPENSER, and adds the
+following information for each household:
+
+- Accommodation floor area (band)
+- Accommodation age (band)
+- Gas (flag: Y/N)
+
+The accommodation information is originally obtained from Domestic Energy
+Performance Certificates (EPC) and then codified in this package.
+
+To enrich the SPENSER population with the EPC data, the Propensity Score
+Matching (PSM) method is applied.
+
+The main output is an enriched synthetic population that we use as input for
+energy estimation models developed by the Energy Flexibility Project.
+
+## Environment setup
+
+This package currently supports running on Linux.
+
+To start working with this repository you need to clone it onto your local
+machine:
+
+```bash
+$ git clone https://github.com/anetobradley/energy_flex.git
+$ cd energy_flex/01-ActivityModel
+```
+
+This package requires a specific
+[conda](https://docs.anaconda.com/anaconda/install/) environment.
+You can create an environment for this project using the provided
+environment file:
+
+```bash
+$ conda env create -f environment.yml
+$ conda activate energyflex
+```
+
+## Configuring the model
+
+### Required
+
+#### EPC Credentials
+
+To retrieve data to run the model you will need to have EPC-API credentials.
+You can register [here](https://epc.opendatacommunities.org/#register).
+Next you need to add your credentials into the
+[epc_api](./config/epc_api.yaml) file (you can use your favourite text
+editor for this):
+
+```bash
+$ nano config/epc_api.yaml
+# EPC credentials
+epc_user: "user@email"
+epc_key: "user_key"
+```
+
+#### Local Authority codes
+
+You need to provide the code for all Local Authorities for which you want a
+synthetic population. Please insert the values [here](./config/lad_codes.yaml).
+If you do not provide any additional value, the default is to return the
+population just for Haringey.
+
+You can find
+[here](https://epc.opendatacommunities.org/docs/api/domestic#domestic-local-authority)
+all LAD codes available in the EPC data.
+
+### Optional
+
+#### Year
+
+You can define [here](./config/epc_api.yaml) a different range of the EPC
+lodgement date (the default is 2008-2022).
+
+#### EPC variables
+
+If you want to enrich the synthetic population with more EPC variables you
+need to add them in two lists:
+
+- [epc_api config file](./config/epc_api.yaml) under `epc_headers`.
+- [psm config file](./config/psm.yaml) under `matches_columns`.
+
+You can find a complete EPC Glossary
+[here](https://epc.opendatacommunities.org/docs/guidance#glossary),
+but be aware that there is a difference between the spellings of the terms
+described in this list and how they are used in the API. In our experience the
+differences are:
+
+- capital letters must be written in lowercase letters.
+- underscore must be replaced by a hyphen.
+
+We also warn that most of the information is unencoded, which can make it
+difficult to use (as well as making the output file unnecessarily large).
+The default variables (accommodation floor area, accommodation age, gas)
+are properly encoded and organized by this package.
+
+#### Data url
+
+Three datasets are obtained through URLs:
+
+- EPC data
+- SPENSER data
+- Area lookup data
+
+If you want to use different urls, you can change them in:
+
+- EPC url [here](./config/epc_api.yaml) under `epc_url`
+- SPENSER url [here](./config/spenser.yaml) under `spenser_url`
+- Area lookup url [here](./config/lookups.yaml) under `area_url`
+
+Note: You can obtain data from other places, after all new
+versions are expected, but it is necessary to ensure that the data structure
+is similar or the code will not work.
+
+#### Area granularity
+
+The default granularity is Output Areas, but you can use others, like:
+
+- Lower Layer Super Output Areas (`lsoa11cd`)
+- Middle Layer Super Output Areas (`msoa11cd`)
+- Local authority districts (`ladcd`)
+
+To change this, please use the `area_in_out` variable
+[here](./config/lookups.yaml).
+
+Note that if you change the Area lookup url, the granularity codes may also
+change!
+
+## Installation & Usage
+
+Next we install the Activity Model package into the environment using `setup.py`:
+
+```bash
+# for using the code base use
+$ python setup.py install
+```
+
+## Running the model
+
+If you installed the package with the `setup.py` file, to run the model:
+
+```bash
+$ python activity_model
+```
+
+If you did not install the package with the `setup.py` file, you can run the
+code through
+
+```bash
+# for using the code base use
+$ python activity_model/__main__.py
+```
+
+## Outputs
+
+The outputs are stored at `data/output/`. Three outputs are expected:
+
+1. Propensity score distribution images for each local authority.
+2. Internal validation images for each local authority.
+3. Enriched synthetic population for each local authority (CSV file).
+ All CSV files are compressed into a zip file.
diff --git a/01-ActivityModel/activity-model/activity_model/__init__.py b/01-ActivityModel/activity-model/activity_model/__init__.py
new file mode 100644
index 0000000..b794fd4
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/__init__.py
@@ -0,0 +1 @@
+__version__ = '0.1.0'
diff --git a/01-ActivityModel/activity-model/activity_model/__main__.py b/01-ActivityModel/activity-model/activity_model/__main__.py
new file mode 100644
index 0000000..83d09f3
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/__main__.py
@@ -0,0 +1,28 @@
+import yaml
+
+from data_preparation import Epc, Spenser
+from enriching_population import EnrichingPopulation
+
+if __name__ == "__main__":
+ print("hi")
+ spenser = Spenser()
+ epc = Epc()
+ psm = EnrichingPopulation()
+
+ list_df = []
+ list_df_names = []
+ lad_codes_yaml = open("config/lad_codes.yaml")
+ parsed_lad_codes = yaml.load(lad_codes_yaml, Loader=yaml.FullLoader)
+ lad_codes = parsed_lad_codes.get("lad_codes")
+
+ for lad_code in lad_codes:
+ spenser_df = spenser.step(lad_code)
+ epc_df = epc.step(lad_code)
+ rich_df = psm.step(
+ spenser_df, epc_df, lad_code, psm_fig=True, validation_fig=True
+ )
+ list_df_names.append("_".join([lad_code, "hh_msm_epc.csv"]))
+ list_df.append(rich_df)
+
+ psm.save_enriched_pop(list_df_names, list_df)
+
diff --git a/01-ActivityModel/activity-model/activity_model/data_preparation.py b/01-ActivityModel/activity-model/activity_model/data_preparation.py
new file mode 100644
index 0000000..7e96b17
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/data_preparation.py
@@ -0,0 +1,350 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import yaml
+import pandas as pd
+import requests
+import io
+import zipfile
+
+
+class Epc:
+    """Class to represent the EPC data and related parameters/methods."""
+
+ def __init__(self) -> None:
+ """Initialise an EPC class."""
+ # Configure epc api related parameters from "config/epc_api.yaml"
+ epc_api_yaml = open("config/epc_api.yaml")
+ parsed_epc_api = yaml.load(epc_api_yaml, Loader=yaml.FullLoader)
+ self.epc_user = parsed_epc_api.get("epc_user")
+ self.epc_key = parsed_epc_api.get("epc_key")
+ self.epc_url = parsed_epc_api.get("epc_url")
+ self.epc_years = parsed_epc_api.get("epc_years")
+ self.desired_headers = parsed_epc_api.get("epc_headers")
+
+ # Using epc api info to build all base url-filter
+ self.epc_filter = self.get_epc_url_filter()
+
+ # Configure lookups
+ ## Lookups from "config/lookups.yaml" file
+ lookup_yaml = open("config/lookups.yaml")
+ parsed_lookup = yaml.load(lookup_yaml, Loader=yaml.FullLoader)
+ self.accommodation_lookup = parsed_lookup.get("accommodation")
+ self.age_categorical_lookup = parsed_lookup.get("age_categorical")
+ self.age_numerical_lookup = parsed_lookup.get("age_numerical")
+ self.floor_area_lookup = parsed_lookup.get("floor_area")
+ self.gas_lookup = parsed_lookup.get("gas")
+ self.tenure_lookup = parsed_lookup.get("tenure")
+ url = parsed_lookup.get("area_url")
+ area_in_out = parsed_lookup.get("area_in_out")
+ area_lookup = pd.read_csv(
+ url,
+ compression="zip",
+ # usecols=[area_in, area_out],
+ usecols=[area_in_out[0], area_in_out[1]],
+ encoding="unicode_escape",
+ engine="python",
+ )
+ self.area_lookup = (
+ area_lookup.set_index(area_in_out[0], drop=True)
+ .loc[:, area_in_out[1]]
+ .to_dict()
+ )
+
+ def get_epc_url_filter(self):
+ """Build a list of EPC search filters urls.
+
+ According to EPC-API
+ [documentation](https://epc.opendatacommunities.org/docs/api/domestic)
+ the API is designed to return up to 10,000 records at a time, with a
+ maximum page size of 5,000. If more than 10,000 records are required,
+        it is necessary to vary the search filters and make multiple requests.
+
+ This method returns a list of filter urls to get the maximum possible
+ volume of data. Each filter url covers 4 months.
+
+ :return: EPC-API urls with filters.
+ :rtype: list
+ """
+
+ url_filter = []
+ for i in range(self.epc_years[0], self.epc_years[1], 1):
+ for j in range(3):
+ for k in range(2):
+ search = f"size=5000&from-year={i}&from-month={(j*4)+1}&to-year={i}&to-month={(j+1)*4}&from={k*5000}&local-authority="
+ url_filter.append(self.epc_url + search)
+ return url_filter
+
+ def get_epc_dataframe(self, lad_code) -> pd.DataFrame:
+ """Get EPC data for a given local authority.
+
+ This function uses the EPC-API to get a large amount of data.
+ Due to data limitation per request, several filters are considered.
+
+        Note 1: You need to insert a valid EPC user/key (config/epc_api.yaml)
+
+        Note 2: Some data intervals return a null value (usually in early 2008) and
+ an exception is used to avoid errors in this case.
+
+ :param lad_code: Local authority code.
+ :type lad_code: string
+ :return: A data frame with all EPC collected data.
+ :rtype: pandas.DataFrame
+ """
+ url_filter = [s + lad_code for s in self.epc_filter]
+ headers = {"Accept": "text/csv"}
+ list_df = []
+ for url in url_filter:
+ try:
+ res = requests.get(
+ url, headers=headers, auth=(self.epc_user, self.epc_key)
+ ).content
+ df = pd.read_csv(
+ io.StringIO(res.decode("utf-8")), usecols=self.desired_headers
+ )
+ list_df.append(df)
+ except pd.errors.EmptyDataError:
+ """
+                Some data intervals return a null value (usually in early 2008).
+ This Exception is raised to avoid errors in this situation.
+ Warning: Problems in EPC-API may be difficult to follow.
+ """
+ pass
+ return pd.concat(list_df)
+
+ @staticmethod
+ def remove_duplicates(df):
+ """Remove EPC Duplicate Certificates
+
+ When using the EPC datasets we need to be careful with duplicate EPCs
+ for the same property. While not an enormous issue as an EPC is valid
+ for up to 10 years unless the property is renovated or retrofitted,
+ there may be multiple records especially for rental properties which are
+ improved to meet recent regulations.
+
+        This function removes duplicates with the same BUILDING REFERENCE
+ NUMBER by selecting the most recent record and discarding others.
+
+ :param df: Raw EPC dataset.
+ :type df: pandas.DataFrame
+ :return: EPC dataset without duplicate Certificates.
+ :rtype: pandas.DataFrame
+ """
+ df["lodgement-datetime"] = pd.to_datetime(df["lodgement-datetime"])
+ df = df.sort_values(by=["building-reference-number", "lodgement-datetime"])
+ df.drop_duplicates(
+ subset=["building-reference-number"], keep="last", inplace=True
+ )
+ df.sort_index(inplace=True)
+ df.reset_index(drop=True, inplace=True)
+ drop_list = ["building-reference-number", "lodgement-datetime"]
+ df.drop(drop_list, axis=1, inplace=True)
+ return df
+
+ @staticmethod
+ def set_categorical_code(df, df_col, lookup, rename=False):
+ """ Apply the lookup to a categorical column.
+
+ Transform the values in a dataframe column using a lookup dictionary.
+ This method is valid when the column values are categorical.
+
+ :param df: The input dataframe.
+ :type df: pandas.dataframe
+ :param df_col: The column in df that represents the categorical values.
+ :type df_col: string
+ :param lookup: A dictionary from categorical values to categorical codes.
+ :type lookup: dict
+ :param rename: The new column name after transformation (if false, keep
+ the current name), defaults to False.
+ :type rename: bool, optional
+ :return: Returns the data with the updated column.
+ :rtype: pandas.DataFrame
+ """
+
+ # This looks redundant, but ensures that the function works even for
+ # missing values (returning empty code).
+ def augment(x, lookup):
+ try:
+ return lookup[x]
+ except:
+ return
+
+ # setting new values according the rename_dict
+ df[df_col] = df[df_col].apply(func=lambda x: augment(x, lookup))
+
+ # remove empty rows
+ df.dropna(subset=[df_col], inplace=True)
+
+ # rename column
+ if rename:
+ df.rename({df_col: rename}, axis=1, inplace=True)
+
+ @staticmethod
+ def set_numerical_code(df, df_col, lookup, rename=False):
+ """Apply the lookup to a numerical column
+
+ Transform the values in a dataframe column using a lookup dictionary.
+ This method is valid when the column values are numerical, following
+ the rule:
+
+ if (j < value <= k), then, (value = i).
+
+ :param df: The input dataframe.
+ :type df: pandas.dataframe
+ :param df_col: The column in df that represents the numerical values.
+ :type df_col: string
+ :param lookup: A dictionary from numerical values to numerical codes;
+ The dictionary structure is [[i1, j1, k1], [i2, j2, k2], ...,
+ [iN, jN, kN]], where: iN is the desired code for band N, jN is the
+ minimum value of the band N (not included), kN is the maximum value
+ of the band N (included), and N is the number of bands.
+ :type lookup: dict
+ :param rename: The new column name after transformation (if false, keep
+ the current name), defaults to False.
+ :type rename: bool, optional
+ """
+ for band in lookup:
+ df.loc[(df[df_col] > band[1]) & (df[df_col] <= band[2]), df_col] = band[0]
+
+ # remove out bound and empty rows
+ df.dropna(subset=[df_col], inplace=True)
+
+ if rename:
+ df.rename({df_col: rename}, axis=1, inplace=True)
+
+ def set_lookups(self, df):
+ """Update all columns using the lookups dictionaries.
+
+ Update the information related with area, tenure, accommodation type,
+ construction age band, main gas flag, and floor area, by using the self
+ lookup variables (accommodation_lookup, age_categorical_lookup,
+ age_numerical_lookup, floor_area_lookup, gas_lookup, tenure_lookup,
+ area_lookup) and the set_categorical_code and set_numerical_code
+ functions.
+
+ :param df: Dataframe with EPC information.
+ :type df: pandas.Dataframe
+ """
+ # Area: change area from postcode to output area
+ self.set_categorical_code(df, "postcode", self.area_lookup, rename="Area")
+
+ # Tenure: change the tenure from EPC to SPENSER classification
+ self.set_categorical_code(df, "tenure", self.tenure_lookup)
+
+ # Accommodation type:
+ # - create an EPC accommodation type by combining "property-type" and "built-form"
+ # - change the accommodation type from EPC to SPENSER classification
+ # - discard "property-type" and "built-form" columns
+ df["LC4402_C_TYPACCOM"] = df["property-type"] + ": " + df["built-form"]
+ self.set_categorical_code(df, "LC4402_C_TYPACCOM", self.accommodation_lookup)
+ df.pop("property-type")
+ df.pop("built-form")
+
+ # Construction age band:
+ # - initially is a combination of categorical and numeric values
+ # - convert all categorical values into absolute ages
+ # - groups the absolute build age into bands
+ self.set_categorical_code(
+ df,
+ "construction-age-band",
+ self.age_categorical_lookup,
+ rename="ACCOM_AGE",
+ )
+ df["ACCOM_AGE"] = df["ACCOM_AGE"].apply(pd.to_numeric)
+ self.set_numerical_code(df, "ACCOM_AGE", self.age_numerical_lookup)
+
+ # Main gas flag: change the values (N, Y) to (0, 1)
+ self.set_categorical_code(df, "mains-gas-flag", self.gas_lookup, rename="GAS")
+
+ # Floor Area: groups the absolute area into bands
+ area_max_lim = self.floor_area_lookup[-1][2]
+ df.rename({"total-floor-area": "FLOOR_AREA"}, axis=1, inplace=True)
+ df.drop(df[df.FLOOR_AREA > area_max_lim].index, inplace=True)
+ self.set_numerical_code(df, "FLOOR_AREA", self.floor_area_lookup)
+
+ def step(self, lad_code):
+ """EPC data preparation main step.
+
+        For each given local authority, this function gets the raw EPC data
+        using an API approach and then returns a processed EPC dataset.
+
+ :param lad_code: Local authority code.
+ :type lad_code: string
+ :return: processed EPC data
+ :rtype: pandas.DataFrame
+ """
+ # Create EPC dataframe for local authority lad_code
+ df = self.get_epc_dataframe(lad_code)
+
+ df = self.remove_duplicates(df)
+
+ # Apply all lookups
+ self.set_lookups(df)
+
+ # Change selected columns to integer values
+ cols = ["FLOOR_AREA", "ACCOM_AGE", "GAS", "tenure", "LC4402_C_TYPACCOM"]
+ df[cols] = df[cols].applymap(np.int64)
+ return df
+
+
+class Spenser:
+ """Class to represent the SPENSER data and related parameters/methods."""
+
+ def __init__(self) -> None:
+ """Initialise a Spenser class."""
+ # Configure SPENSER related parameters from "config/spenser.yaml"
+ spenser_yaml = open("config/spenser.yaml")
+ parsed_spenser = yaml.load(spenser_yaml, Loader=yaml.FullLoader)
+ spenser_url = parsed_spenser.get("spenser_url")
+ r = requests.get(spenser_url)
+ self.spenser_zip_file = zipfile.ZipFile(io.BytesIO(r.content))
+
+ def set_new_tenure(self, df) -> pd.DataFrame:
+ """Create new temporary tenure column
+
+ This method creates a new tenure column (following EPC values) where
+ the sub-categories
+ - "Owned outright"(=2)
+ - shared ownership" (=3)
+ are merged into a general "Owner-occupied" (=1) category.
+
+ :param df: original SPENSER data frame
+ :type df: pandas.Dataframe
+ :return: SPENSER data frame with a new column
+ :rtype: pandas.DataFrame
+ """
+ df["tenure"] = df["LC4402_C_TENHUK11"].copy()
+ df.loc[(df["tenure"] == 2), "tenure"] = 1
+ df.loc[(df["tenure"] == 3), "tenure"] = 1
+ df["tenure"] = df["tenure"].map(np.int64)
+
+ return df
+
+ def step(self, lad_code):
+ """SPENSER data preparation main step.
+
+        For each given local authority, this function gets the raw SPENSER data
+        from a zip file and then returns a processed SPENSER dataset.
+
+ :param lad_code: Local authority code.
+ :type lad_code: string
+ :return: processed SPENSER data
+ :rtype: pandas.DataFrame
+ """
+
+ # From the zipfile - open the local authority file
+ lad_file = "_".join(["msm_england/ass_hh", lad_code, "OA11_2020.csv"])
+ df = pd.read_csv(self.spenser_zip_file.open(lad_file))
+
+ # Remove "empty" rows: empty codes (here, negative values) are a problem
+ # for PSM method.
+ # TODO: store the "empty" rows in other variable to be possible append
+ # then at the end.
+ df.drop(df[df.LC4402_C_TENHUK11 < 0].index, inplace=True)
+ df.drop(df[df.LC4402_C_TYPACCOM < 0].index, inplace=True)
+
+ # create new tenure
+ df = self.set_new_tenure(df)
+
+ return df
diff --git a/01-ActivityModel/activity-model/activity_model/enriching_population.py b/01-ActivityModel/activity-model/activity_model/enriching_population.py
new file mode 100644
index 0000000..13e89a5
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/enriching_population.py
@@ -0,0 +1,324 @@
+from random import choices
+import zipfile
+from causalinference import CausalModel
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import NearestNeighbors
+import yaml
+import seaborn as sns
+import matplotlib.pyplot as plt
+import os
+from matplotlib.ticker import MaxNLocator
+
+
+class EnrichingPopulation:
+ """Class to enrich a synthetic population.
+
+    To create an enriched synthetic population, this class combines two
+    pandas.DataFrames using the Propensity Score Matching approach.
+ """
+
+ def __init__(self) -> None:
+ """Initialise an EnrichingPopulation class."""
+ # Configure PSM related parameters from "config/psm.yaml"
+ psm_yaml = open("config/psm.yaml")
+ parsed_psm = yaml.load(psm_yaml, Loader=yaml.FullLoader)
+ self.n_neighbors = parsed_psm.get("n_neighbors")
+ self.overlap_columns = parsed_psm.get("overlap_columns")
+ self.matches_columns = parsed_psm.get("matches_columns")
+
+ @staticmethod
+ def set_treatment(df0, df1):
+ """Create a "Treatment" column in each dataframe.
+
+ :param df0: SPENSER data
+ :type df0: pandas.DataFrame
+ :param df1: EPC data
+ :type df1: pandas.DataFrame
+ :return: Two dataframes, first the SPENSER data + new column
+ ("Treatment" = 0), second the EPC data + new column ("Treatment" = 1)
+ :rtype: pandas.DataFrame, pandas.DataFrame
+ """
+
+ df0["Treatment"] = 0
+ df1["Treatment"] = 1
+
+ return df0, df1
+
+ @staticmethod
+ def set_area_factor(df):
+ """Add a new Area column by factorizing the Area codes.
+
+ :param df: Dataset with a Area column.
+ :type df: pandas.DataFrame
+ :return: Input dataset + new factorized Area column
+ :rtype: pandas.DataFrame
+ """
+ Area_factor = df.Area.factorize()
+ df["Area_factor"] = Area_factor[0]
+
+ return df
+
+ @staticmethod
+ def get_propensity_score(df, overlap_columns):
+ """Return the propensity score values.
+
+ :param df: complete dataframe
+ :type df: pandas.DataFrame
+ :param overlap_columns: list of columns names that are present in both
+ datasets (EPC and SPENSER).
+ :type overlap_columns: list
+ :return: list of propensity score for all rows.
+ :rtype: numpy.ndarray
+ """
+
+ ## Isolate the Y, X and the covariates
+ Y = df["Treatment"].copy() # 1-Dimension outcome - arbitrary values
+ X = df["Treatment"].copy() # 1-Dimension treatment
+ C = df[overlap_columns].copy() # n-Dimension covariates
+
+ # Transform pandas dataframe into numpy.ndarray (CausalModel requisite)
+ Y = Y.values
+ X = X.values
+ C = C.values
+
+ # Create the Causal Model
+ model = CausalModel(Y, X, C)
+
+ # Propensity score calculation
+ model.est_propensity_s()
+ return model.propensity["fitted"]
+
+ @staticmethod
+ def get_neighbors(df1, df2, n_neighbors):
+ """For each SPENSER row get a list of EPC rows with the closest propensity score values.
+
+ :param df1: SPENSER dataset
+ :type df1: pandas.DataFrame
+ :param df2: EPC dataset
+ :type df2: pandas.DataFrame
+ :param n_neighbors: Number of neighbors.
+ :type n_neighbors: integer
+ :return: The propensity score difference and the indices of the closest neighbors.
+ :rtype: list, list
+ """
+ # create the neighbors object (p=2 means Euclidean distance)
+ knn = NearestNeighbors(n_neighbors=n_neighbors, p=2).fit(df2[["ps"]])
+
+ # for each household in df1 dataframe, find the nearest df2 neighbors
+ distances, indices = knn.kneighbors(df1[["ps"]])
+ return distances, indices
+
+ @staticmethod
+ def get_matches(distances, indices, n_neighbors):
+ """From the neighbors list get one match for each SPENSER row.
+
+ EPC rows with the same propensity score value have the same probability
+ of being matched with a SPENSER row. The greater the difference between
+ the propensity score values, the lower the probability of being drawn.
+ The weight function used, is a step function.
+
+ :param distances: List of propensity score difference between the closest neighbors.
+ :type distances: list
+ :param indices: List of the closest neighbors indices.
+ :type indices: list
+ :param n_neighbors: Number of neighbors.
+ :type n_neighbors: integer
+ :return: List of assigned pairs.
+ :rtype: list
+ """
+ pairs = []
+ for index1, candidates2 in enumerate(indices):
+ is_zero = np.flatnonzero(distances[index1] == 0)
+ if is_zero.size < n_neighbors:
+ weight = 100 - (distances[index1] / distances[index1][-1] * 95)
+ index2 = choices(candidates2, weights=weight)[0]
+ else:
+ index2 = choices(candidates2)[0]
+ pairs.append([index1, index2])
+
+ return pairs
+
+ @staticmethod
+ def get_enriched_pop(pairs, df1, df2, matches_columns):
+ """Returns the SPENSER enriched population.
+
+        Combine the EPC data with the SPENSER data to generate an enriched
+ synthetic population. To combine the datasets, the propensity score
+ matching method is used.
+
+ :param pairs: List of assigned pairs.
+ :type pairs: list
+ :param df1: SPENSER dataset.
+ :type df1: pandas.DataFrame
+ :param df2: EPC dataset.
+ :type df2: pandas.DataFrame
+ :param matches_columns: List of columns name from EPC to be incorporated
+ into SPENSER dataset.
+ :type matches_columns: list
+ :return: The enriched synthetic population
+ :rtype: pandas.DataFrame
+ """
+ # Add matched df2 index id in df1 dataframe
+ matches = pd.DataFrame(pairs)
+ df1["EPCid"] = matches[1]
+
+ drop_list = [
+ "tenure",
+ "ps",
+ "Treatment",
+ "Area_factor",
+ *matches_columns,
+ ]
+ df1.drop(drop_list, axis=1, inplace=True)
+
+ df2 = df2[matches_columns].copy()
+ df2["EPCid"] = df2.index
+
+ df1 = pd.merge(df1, df2, on="EPCid", how="left")
+ df1.drop(["EPCid"], axis=1, inplace=True)
+
+ return df1
+
+ @staticmethod
+ def save_enriched_pop(list_df_names, list_df):
+ """Save the synthetic population.
+
+ Save the synthetic population into a zip file. Each local authority is
+ stored in a different csv file.
+
+ :param list_df_names: Names of each csv file.
+ :type list_df_names: list
+ :param list_df: List of dataframes.
+ :type list_df: list
+ """
+ if not (os.path.exists("data/output/")):
+ os.makedirs("data/output/")
+
+ # save final population
+ csv_name = os.path.join("data/output/", "msm_epc_england.zip")
+ with zipfile.ZipFile(csv_name, "w") as csv_zip:
+ for i in range(len(list_df_names)):
+ csv_zip.writestr(
+ list_df_names[i], list_df[i].to_csv(index=False, header=True)
+ )
+
+ @staticmethod
+ def save_psm_fig(df, lad_code):
+ """Save the propensity score distribution image.
+
+ This image is a way to visualize the compatibility between the two
+ datasets.
+
+ :param df: SPENSER + EPC merged dataset.
+ :type df: pandas.DataFrame
+ :param lad_code: Local authority district code.
+ :type lad_code: string
+ """
+ if not (os.path.exists("data/output/")):
+ os.makedirs("data/output/")
+
+ sns.set(color_codes=True)
+ # Propensity score comparison plot
+ plt.figure(figsize=(10, 7), dpi=80)
+ sns.kdeplot(df["ps"], hue=df["Treatment"], shade=True)
+
+ # * MSM: `Treatment` = 0
+ # * EPC: `Treatment` = 1
+ plt.xlabel("Propensity Score")
+ fig_name = "_".join([lad_code, "propensity_score.png"])
+ fig_name = os.path.join("data/output/", fig_name)
+ plt.savefig(fig_name)
+ plt.close()
+
+ @staticmethod
+ def save_validation_fig(df1, df2, lad_code):
+ """Save the internal validation image.
+
+ Floor Area distribution and Accommodation age codes distribution
+ comparison between original EPC data and Enriched Synthetic population.
+
+ :param df1: SPENSER dataset.
+ :type df1: pandas.DataFrame
+ :param df2: EPC dataset.
+ :type df2: pandas.DataFrame
+ :param lad_code: Local authority district code.
+ :type lad_code: string
+ """
+ if not (os.path.exists("data/output/")):
+ os.makedirs("data/output/")
+
+ fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))
+
+ bins = list(range(1, 20, 1))
+ df2.FLOOR_AREA.plot(kind="hist", ax=ax[0][0], title="EPC", bins=bins)
+ df2.ACCOM_AGE.plot(kind="hist", ax=ax[1][0], title="EPC", bins=10)
+
+ df1.FLOOR_AREA.plot(
+ kind="hist", ax=ax[0][1], title="Enriched SPENSER", bins=bins
+ )
+ df1.ACCOM_AGE.plot(kind="hist", ax=ax[1][1], title="Enriched SPENSER", bins=10)
+
+ ax[0][0].xaxis.set_major_locator(MaxNLocator(integer=True))
+ ax[0][1].xaxis.set_major_locator(MaxNLocator(integer=True))
+
+ ax[0][0].set_xlabel("Floor area code")
+ ax[0][1].set_xlabel("Floor area code")
+
+ ax[1][0].set_xlabel("Accommodation age code")
+ ax[1][1].set_xlabel("Accommodation age code")
+
+ fig.tight_layout(pad=3.0)
+ fig_name = "_".join([lad_code, "validation.png"])
+ fig_name = os.path.join("data/output/", fig_name)
+ plt.savefig(fig_name)
+ plt.close()
+
+ def step(self, df0, df1, lad_code, psm_fig=True, validation_fig=True):
+ """Enriching population main step.
+
+ In this step the EPC data and the SPENSER data are combined to generate
+ an enriched synthetic population for a given local authority.
+
+ :param df0: SPENSER dataset.
+ :type df0: pandas.DataFrame
+ :param df1: EPC dataset.
+ :type df1: pandas.DataFrame
+ :param lad_code: Local authority district code.
+ :type lad_code: string
+ :param psm_fig: Boolean to save the propensity score distribution image, defaults to True.
+ :type psm_fig: bool, optional
+ :param validation_fig: Boolean to save the internal validation image, defaults to True.
+ :type validation_fig: bool, optional
+ :return: Enriched synthetic population
+ :rtype: pandas.DataFrame
+ """
+ df0, df1 = self.set_treatment(df0, df1)
+ dataset = pd.concat([df0, df1], ignore_index=True, sort=False)
+ dataset = self.set_area_factor(dataset)
+ dataset["ps"] = self.get_propensity_score(dataset, self.overlap_columns)
+
+ # TODO instead save the image every step, the image should be stored in
+ # a list. The list should be saved as zip file at the end.
+ if psm_fig:
+ self.save_psm_fig(dataset, lad_code)
+
+ # Separating EPC data from MSM data
+ df0 = dataset.loc[dataset.Treatment == 0].reset_index(drop=True)
+ df1 = dataset.loc[dataset.Treatment == 1].reset_index(drop=True)
+ del dataset
+
+ # Get neighbors and matched pairs
+ distances, indices = self.get_neighbors(df0, df1, self.n_neighbors)
+ pairs = self.get_matches(distances, indices, self.n_neighbors)
+ del distances, indices
+
+ # Get enriched population
+ rich_df = self.get_enriched_pop(pairs, df0, df1, self.matches_columns)
+
+ # TODO instead save the image every step, the image should be stored in
+ # a list. The list should be saved as zip file at the end.
+ if validation_fig:
+ self.save_validation_fig(rich_df, df1, lad_code)
+
+ return rich_df
diff --git a/01-ActivityModel/activity-model/config/epc_api.yaml b/01-ActivityModel/activity-model/config/epc_api.yaml
new file mode 100644
index 0000000..d3e572b
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/epc_api.yaml
@@ -0,0 +1,25 @@
+################################################################################
+# Insert in this file the information needed to download EPC data via the API.
+#
+################################################################################
+
+# EPC credentials
+epc_user: "user@email"
+epc_key: "user_key"
+
+epc_url: "https://epc.opendatacommunities.org/api/v1/domestic/search?"
+
+epc_years:
+ - 2008
+ - 2022
+
+epc_headers:
+ - "postcode"
+ - "property-type"
+ - "built-form"
+ - "construction-age-band"
+ - "tenure"
+ - "total-floor-area"
+ - "mains-gas-flag"
+ - "building-reference-number"
+ - "lodgement-datetime"
diff --git a/01-ActivityModel/activity-model/config/lad_codes.yaml b/01-ActivityModel/activity-model/config/lad_codes.yaml
new file mode 100644
index 0000000..6f7054a
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/lad_codes.yaml
@@ -0,0 +1,8 @@
+################################################################################
+# Insert in this file the local authority district codes (lad_codes) that you
+# are interested in. The default is Haringey (E09000014).
+#
+################################################################################
+
+lad_codes:
+ - E09000014
\ No newline at end of file
diff --git a/01-ActivityModel/activity-model/config/lookups.yaml b/01-ActivityModel/activity-model/config/lookups.yaml
new file mode 100644
index 0000000..5c0e5eb
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/lookups.yaml
@@ -0,0 +1,235 @@
+################################################################################
+# This file lists all the lookups used by the Activity Model to organize the
+# EPC data before it is merged with the Synthetic Population.
+#
+################################################################################
+# EPC data info:
+# - EPCs issued from January 2008 up to and including 30 June 2021.
+# - 21,440,172 Domestic EPCs
+# - Download: 18 October 2021
+#
+# * Note that new EPCs may include keys not listed here!
+#
+################################################################################
+# EPC Headers:
+# Each file has 90 headers (keys). The headers used in this work are:
+# - POSTCODE
+# - LOCAL_AUTHORITY
+# - PROPERTY_TYPE + BUILT_FORM = Accommodation type
+# - CONSTRUCTION_AGE_BAND
+# - TENURE
+# - TOTAL_FLOOR_AREA # m²
+# - NUMBER_HABITABLE_ROOMS
+#
+################################################################################
+# Dictionary lookups:
+# keys: based on the values present in the EPC
+# keys_values: based on the values present in the Synthetic Population
+#
+################################################################################
+# List lookups (accommodation age and floor area)
+# values: arbitrary chunks
+#
+# List layout:
+# -
+# - chunk desired code
+# - chunk minimum value (not included)
+# - chunk maximum value (included)
+################################################################################
+
+## Area lookup from: https://geoportal.statistics.gov.uk/
+area_url: "https://www.arcgis.com/sharing/rest/content/items/8a824519215947da99146692b0a0ff49/data"
+
+area_in_out:
+ - "pcds"
+ - "oa11cd"
+
+accommodation:
+ 'House: Detached': 2
+ 'Bungalow: Detached': 2
+ 'House: Semi-Detached': 3
+ 'Bungalow: Semi-Detached': 3
+ 'House: Mid-Terrace': 4
+ 'House: End-Terrace': 4
+ 'House: Enclosed Mid-Terrace': 4
+ 'House: Enclosed End-Terrace': 4
+ 'Bungalow: Mid-Terrace': 4
+ 'Bungalow: End-Terrace': 4
+ 'Bungalow: Enclosed Mid-Terrace': 4
+ 'Bungalow: Enclosed End-Terrace': 4
+ 'Flat: NO DATA!': 5
+ 'Flat: Detached': 5
+ 'Flat: Semi-Detached': 5
+ 'Flat: Mid-Terrace': 5
+ 'Flat: End-Terrace': 5
+ 'Flat: Enclosed Mid-Terrace': 5
+ 'Flat: Enclosed End-Terrace': 5
+ 'Maisonette: NO DATA!': 5
+ 'Maisonette: Detached': 5
+ 'Maisonette: Semi-Detached': 5
+ 'Maisonette: Mid-Terrace': 5
+ 'Maisonette: End-Terrace': 5
+ 'Maisonette: Enclosed Mid-Terrace': 5
+ 'Maisonette: Enclosed End-Terrace': 5
+ 'Park home: Detached': 5
+ 'Park home: Semi-Detached': 5
+ 'House: NO DATA!': null
+ 'Bungalow: NO DATA!': null
+
+age_categorical:
+ 'England and Wales: before 1900': 1
+ 'England and Wales: 1900-1929': 1
+ 'England and Wales: 1930-1949': 1930
+ 'England and Wales: 1950-1966': 1950
+ 'England and Wales: 1967-1975': 1967
+ 'England and Wales: 1976-1982': 1976
+ 'England and Wales: 1983-1990': 1983
+ 'England and Wales: 1991-1995': 1991
+ 'England and Wales: 1996-2002': 1996
+ 'England and Wales: 2003-2006': 2003
+ 'England and Wales: 2007-2011': 2007
+ 'England and Wales: 2007 onwards': 2007
+ 'England and Wales: 2012 onwards': 2007
+ INVALID!: null
+ Not applicable: null
+ NO DATA!: null
+
+
+gas:
+ N: 0
+ Y: 1
+
+tenure:
+ Owner-occupied: 1
+ owner-occupied: 1
+ rental (social): 5
+ Rented (social): 5
+ rental (private): 6
+ Rented (private): 6
+ Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is no: null
+ unknown: null
+ NO DATA!: null
+
+age_numerical:
+ -
+ - 1
+ - 0
+ - 1929
+ -
+ - 2
+ - 1929
+ - 1949
+ -
+ - 3
+ - 1949
+ - 1966
+ -
+ - 4
+ - 1966
+ - 1975
+ -
+ - 5
+ - 1975
+ - 1982
+ -
+ - 6
+ - 1982
+ - 1990
+ -
+ - 7
+ - 1990
+ - 1995
+ -
+ - 8
+ - 1995
+ - 2002
+ -
+ - 9
+ - 2002
+ - 2006
+ -
+ - 10
+ - 2006
+ - 3000
+
+floor_area:
+ -
+ - 1
+ - 0
+ - 25
+ -
+ - 2
+ - 25
+ - 50
+ -
+ - 3
+ - 50
+ - 75
+ -
+ - 4
+ - 75
+ - 100
+ -
+ - 5
+ - 100
+ - 125
+ -
+ - 6
+ - 125
+ - 150
+ -
+ - 7
+ - 150
+ - 175
+ -
+ - 8
+ - 175
+ - 200
+ -
+ - 9
+ - 200
+ - 225
+ -
+ - 10
+ - 225
+ - 250
+ -
+ - 11
+ - 250
+ - 275
+ -
+ - 12
+ - 275
+ - 300
+ -
+ - 13
+ - 300
+ - 325
+ -
+ - 14
+ - 325
+ - 350
+ -
+ - 15
+ - 350
+ - 375
+ -
+ - 16
+ - 375
+ - 400
+ -
+ - 17
+ - 400
+ - 425
+ -
+ - 18
+ - 425
+ - 450
+ -
+ - 19
+ - 450
+ - 475
+ -
+ - 20
+ - 475
+ - 500
diff --git a/01-ActivityModel/activity-model/config/psm.yaml b/01-ActivityModel/activity-model/config/psm.yaml
new file mode 100644
index 0000000..7520d71
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/psm.yaml
@@ -0,0 +1,17 @@
+################################################################################
+# Insert in this file the necessary information to configure the
+# EnrichingPopulation class.
+#
+################################################################################
+
+n_neighbors: 200
+
+overlap_columns:
+ - "LC4402_C_TYPACCOM"
+ - "tenure"
+ - "Area_factor"
+
+matches_columns:
+ - "FLOOR_AREA"
+ - "GAS"
+ - "ACCOM_AGE"
diff --git a/01-ActivityModel/activity-model/config/spenser.yaml b/01-ActivityModel/activity-model/config/spenser.yaml
new file mode 100644
index 0000000..a191483
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/spenser.yaml
@@ -0,0 +1,6 @@
+################################################################################
+# Insert in this file the necessary information to download SPENSER data.
+#
+################################################################################
+
+spenser_url: "https://osf.io/623qz/download"
\ No newline at end of file
diff --git a/01-ActivityModel/activity-model/environment.yml b/01-ActivityModel/activity-model/environment.yml
new file mode 100644
index 0000000..7175bb4
--- /dev/null
+++ b/01-ActivityModel/activity-model/environment.yml
@@ -0,0 +1,21 @@
+name: energyflex1
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - matplotlib=3.5.1
+ - numpy=1.21.2
+ - pandas=1.4.1
+ - python=3.9.7
+ - pip=21.2.4
+ - pyyaml=6.0
+ - requests=2.27.1
+ - scikit-learn=1.0.2
+ - seaborn=0.11.2
+ - setuptools=58.0.4
+ - pytest=6.2.5
+ - black=19.10b0
+ - sphinx=4.4.0
+ - sphinx_rtd_theme=0.4.3
+ - pip:
+ - causalinference==0.1.3
diff --git a/01-ActivityModel/activity-model/setup.py b/01-ActivityModel/activity-model/setup.py
new file mode 100644
index 0000000..86456ec
--- /dev/null
+++ b/01-ActivityModel/activity-model/setup.py
@@ -0,0 +1,32 @@
+import setuptools
+
+with open("README.md", "r") as fh:
+ long_description = fh.read()
+
+setuptools.setup(
+ name="activity-model",
+ version="0.1.0",
+ author="Patricia Ternes",
+ author_email="p.ternesdallagnollo@leeds.ac.uk",
+ description="The Activity Model package",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ # url="#",
+ # install_requires=[
+ # "numpy=1.21.2",
+ # "pandas=1.4.1",
+ # "requests=2.27.1",
+ # "causalinference==0.1.3",
+ # "scikit-learn=1.0.2",
+ # "seaborn=0.11.2",
+ # "matplotlib=3.5.1",
+ # "pyyaml=6.0",
+ # ]
+ packages=setuptools.find_packages(),
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "Programming Language :: Python :: 3.9",
+ "Intended Audience :: Science/Research",
+ ],
+ python_requires=">=3.9",
+)
diff --git a/01-ActivityModel/activity-model/tests/__init__.py b/01-ActivityModel/activity-model/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/01-ActivityModel/activity-model/tests/test_activity_model.py b/01-ActivityModel/activity-model/tests/test_activity_model.py
new file mode 100644
index 0000000..6d0a755
--- /dev/null
+++ b/01-ActivityModel/activity-model/tests/test_activity_model.py
@@ -0,0 +1,48 @@
+from activity_model import __version__
+from activity_model.data_preparation import Epc
+
+import requests
+import pytest
+
+
+def test_version():  # must stay in sync with the version declared in setup.py
+    assert __version__ == "0.1.0"
+
+
+@pytest.fixture
+def epc():
+ epc = Epc()
+ return epc
+
+
+def test_lookup_type(epc):
+ assert type(epc.accommodation_lookup) is dict
+ assert type(epc.age_categorical_lookup) is dict
+ assert type(epc.gas_lookup) is dict
+ assert type(epc.tenure_lookup) is dict
+ assert type(epc.age_numerical_lookup) is list
+ assert type(epc.floor_area_lookup) is list
+ assert type(epc.area_lookup) is dict
+
+
+def test_epc_connection(epc):  # live network smoke test of the configured EPC credentials
+    url = epc.epc_url
+    user = epc.epc_user
+    key = epc.epc_key
+    headers = {"Accept": "text/csv"}  # ask the API for CSV responses
+
+    r = requests.head(url, headers=headers, auth=(user, key))  # HEAD: status only, no body
+    assert (
+        r.status_code == 200
+    ), "Please check your EPC credentials here: config/epc_api.yaml"
+
+
+# test area lookup connection?
+# test spenser connection?
+
+# test if area column has the right values
+# test if floor area has the right values
+# test if age has the right values
+# test if gas has the right values
+# test if tenure has the right values
+# test if accommodation type has the right values
From d5946698941a86c06ac5444754648b94efadaf12 Mon Sep 17 00:00:00 2001
From: Patricia Ternes
Date: Mon, 11 Apr 2022 14:47:59 +0100
Subject: [PATCH 2/2] fix path
---
01-ActivityModel/activity-model/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/01-ActivityModel/activity-model/README.md b/01-ActivityModel/activity-model/README.md
index 5675d08..f6cea5b 100644
--- a/01-ActivityModel/activity-model/README.md
+++ b/01-ActivityModel/activity-model/README.md
@@ -29,7 +29,7 @@ machine:
```bash
$ git clone https://github.com/anetobradley/energy_flex.git
-$ cd energy_flex/01-ActivityModel
+$ cd energy_flex/01-ActivityModel/activity-model/
```
This package requires a specific