diff --git a/.gitignore b/.gitignore index 30f27b9..38965d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .vscode/* *.ipynb_checkpoints +*.pytest_cache* 01-ActivityModel/data/* *all-domestic-certificates.zip *epc_england.zip diff --git a/01-ActivityModel/activity-model/README.md b/01-ActivityModel/activity-model/README.md new file mode 100644 index 0000000..f6cea5b --- /dev/null +++ b/01-ActivityModel/activity-model/README.md @@ -0,0 +1,168 @@ +# Activity Model + +This is the code repository for the Activity Model package. + +The Activity Model returns a synthetic population to represent the England +household population. This model, however, does not create the population from +scratch, it uses a well-known synthetic population, the SPENSER, and adds the +following information for each household: + +- Accommodation floor area (band) +- Accommodation age (band) +- Gas (flag: Y/N) + +The accommodation information is originally obtained from Domestic Energy +Performance Certificates (EPC) and then, codified in this package. + +To enrich the SPENSER population with the EPC data, the Propensity Score +Matching (PSM) method is applied. + +The main output is an enriched synthetic population that we use as input for +energy estimation models developed by the Energy Flexibility Project. + +## Environment setup + +This package currently supports running on Linux. + +To start working with this repository you need to clone it onto your local +machine: + +```bash +$ git clone https://github.com/anetobradley/energy_flex.git +$ cd energy_flex/01-ActivityModel/activity-model/ +``` + +This package requires a specific +[conda](https://docs.anaconda.com/anaconda/install/) environment. 
+You can create an environment for this project using the provided +environment file: + +```bash +$ conda env create -f environment.yml +$ conda activate energyflex +``` + +## Configuring the model + +### Required + +#### EPC Credentials + +To retrieve data to run the model you will need to have EPC-API credentials. +You can register [here](https://epc.opendatacommunities.org/#register). +Next you need to add your credentials into the +[epc_api](./config/epc_api.yaml) file (you can use your favourite text +editor for this): + +```bash +$ nano config/epc_api.yaml +# EPC credentials +epc_user: "user@email" +epc_key: "user_key" +``` + +#### Local Authority codes + +You need to provide the code for all Local Authorities for which you want a synthetic +population. Please, insert the values [here](./config/lad_codes.yaml). +If you do not provide any additional value, the default is to return the population +just for Haringey. + +You can find +[here](https://epc.opendatacommunities.org/docs/api/domestic#domestic-local-authority) +all LAD codes available in the EPC data. + +### Optional + +#### Year + +You can define [here](./config/epc_api.yaml) a different range of the EPC +lodgement date (the default is 2008-2022). + +#### EPC variables + +If you want to enrich the synthetic population with more EPC variables you +need to add them in two lists: + +- [epc_api config file](./config/epc_api.yaml) under `epc_headers`. +- [psm config file](./config/psm.yaml) under `matches_columns`. + +You can find a complete EPC Glossary +[here](https://epc.opendatacommunities.org/docs/guidance#glossary), +but be aware that there is a difference between the spellings of the terms +described in this list and how they are used in the API. In our experience the +differences are: + +- capital letters must be written in lowercase letters. +- underscore must be replaced by a hyphen.
+ +We also warn that most of the information is unencoded, which can make it +difficult to use (as well as making the output file unnecessarily large). +The default variables (accommodation floor area, accommodation age, gas) +are properly encoded and organized by this package. + +#### Data url + +Three datasets are obtained through urls: + +- EPC data +- SPENSER data +- Area lookup data + +If you want to use different urls, you can change them in: + +- EPC url [here](./config/epc_api.yaml) under `epc_url` +- SPENSER url [here](./config/spenser.yaml) under `spenser_url` +- Area lookup url [here](./config/lookups.yaml) under `area_url` + +Note: You can obtain data from other places, after all new +versions are expected, but it is necessary to ensure that the data structure +is similar or the code will not work. + +#### Area granularity + +The default granularity is Output Areas, but you can use others, like: + +- Lower Layer Super Output Areas (`lsoa11cd`) +- Middle Layer Super Output Areas (`msoa11cd`) +- Local authority districts (`ladcd`) + +To change this, please use the `area_in_out` variable +[here](./config/lookups.yaml). + +Note that if you change the Area lookup url, the granularities code may also +change! + +## Installation & Usage + +Next we install the Activity Model package into the environment using `setup.py`: + +```bash +# for using the code base use +$ python setup.py install +``` + +## Running the model + +If you installed the package with the `setup.py` file, to run the model: + +```bash +$ python activity_model +``` + +If you did not install the package with the `setup.py` file, you can run the +code through + +```bash +# for using the code base use +$ python activity_model/__main__.py +``` + +## Outputs + +The outputs are stored at `data/output/`. Three outputs are expected: + +1. Propensity score distribution images for each local authority. +2. Internal validation images for each local authority. +3. 
"""Entry point: build an enriched synthetic population per local authority."""

import yaml

from data_preparation import Epc, Spenser
from enriching_population import EnrichingPopulation

if __name__ == "__main__":
    spenser = Spenser()
    epc = Epc()
    psm = EnrichingPopulation()

    list_df = []
    list_df_names = []

    # Read the list of local-authority-district codes to process.
    # `with` guarantees the config file handle is closed (it previously
    # leaked), and safe_load avoids arbitrary-object construction — the
    # config only holds plain scalars/lists, so behaviour is unchanged.
    with open("config/lad_codes.yaml") as lad_codes_yaml:
        lad_codes = yaml.safe_load(lad_codes_yaml).get("lad_codes")

    for lad_code in lad_codes:
        # Prepare both datasets, then merge them via propensity score
        # matching for this local authority.
        spenser_df = spenser.step(lad_code)
        epc_df = epc.step(lad_code)
        rich_df = psm.step(
            spenser_df, epc_df, lad_code, psm_fig=True, validation_fig=True
        )
        list_df_names.append("_".join([lad_code, "hh_msm_epc.csv"]))
        list_df.append(rich_df)

    # One zip archive with one CSV per local authority.
    psm.save_enriched_pop(list_df_names, list_df)
__init__(self) -> None: + """Initialise an EPC class.""" + # Configure epc api related parameters from "config/epc_api.yaml" + epc_api_yaml = open("config/epc_api.yaml") + parsed_epc_api = yaml.load(epc_api_yaml, Loader=yaml.FullLoader) + self.epc_user = parsed_epc_api.get("epc_user") + self.epc_key = parsed_epc_api.get("epc_key") + self.epc_url = parsed_epc_api.get("epc_url") + self.epc_years = parsed_epc_api.get("epc_years") + self.desired_headers = parsed_epc_api.get("epc_headers") + + # Using epc api info to build all base url-filter + self.epc_filter = self.get_epc_url_filter() + + # Configure lookups + ## Lookups from "config/lookups.yaml" file + lookup_yaml = open("config/lookups.yaml") + parsed_lookup = yaml.load(lookup_yaml, Loader=yaml.FullLoader) + self.accommodation_lookup = parsed_lookup.get("accommodation") + self.age_categorical_lookup = parsed_lookup.get("age_categorical") + self.age_numerical_lookup = parsed_lookup.get("age_numerical") + self.floor_area_lookup = parsed_lookup.get("floor_area") + self.gas_lookup = parsed_lookup.get("gas") + self.tenure_lookup = parsed_lookup.get("tenure") + url = parsed_lookup.get("area_url") + area_in_out = parsed_lookup.get("area_in_out") + area_lookup = pd.read_csv( + url, + compression="zip", + # usecols=[area_in, area_out], + usecols=[area_in_out[0], area_in_out[1]], + encoding="unicode_escape", + engine="python", + ) + self.area_lookup = ( + area_lookup.set_index(area_in_out[0], drop=True) + .loc[:, area_in_out[1]] + .to_dict() + ) + + def get_epc_url_filter(self): + """Build a list of EPC search filters urls. + + According to EPC-API + [documentation](https://epc.opendatacommunities.org/docs/api/domestic) + the API is designed to return up to 10,000 records at a time, with a + maximum page size of 5,000. If more than 10,000 records are required, + is necessary to vary the search filters and make multiple requests. + + This method returns a list of filter urls to get the maximum possible + volume of data. 
Each filter url covers 4 months. + + :return: EPC-API urls with filters. + :rtype: list + """ + + url_filter = [] + for i in range(self.epc_years[0], self.epc_years[1], 1): + for j in range(3): + for k in range(2): + search = f"size=5000&from-year={i}&from-month={(j*4)+1}&to-year={i}&to-month={(j+1)*4}&from={k*5000}&local-authority=" + url_filter.append(self.epc_url + search) + return url_filter + + def get_epc_dataframe(self, lad_code) -> pd.DataFrame: + """Get EPC data for a given local authority. + + This function uses the EPC-API to get a large amount of data. + Due to data limitation per request, several filters are considered. + + Note 1: You need insert a valid EPC user/key (config/epc_api.yaml) + + Note 2: Some data interval return Null value (usually in early 2008) and + an exception is used to avoid errors in this case. + + :param lad_code: Local authority code. + :type lad_code: string + :return: A data frame with all EPC collected data. + :rtype: pandas.DataFrame + """ + url_filter = [s + lad_code for s in self.epc_filter] + headers = {"Accept": "text/csv"} + list_df = [] + for url in url_filter: + try: + res = requests.get( + url, headers=headers, auth=(self.epc_user, self.epc_key) + ).content + df = pd.read_csv( + io.StringIO(res.decode("utf-8")), usecols=self.desired_headers + ) + list_df.append(df) + except pd.errors.EmptyDataError: + """ + Some data interval return Null value (usually in early 2008). + This Exception is raised to avoid errors in this situation. + Warning: Problems in EPC-API may be difficult to follow. + """ + pass + return pd.concat(list_df) + + @staticmethod + def remove_duplicates(df): + """Remove EPC Duplicate Certificates + + When using the EPC datasets we need to be careful with duplicate EPCs + for the same property. 
While not an enormous issue as an EPC is valid + for up to 10 years unless the property is renovated or retrofitted, + there may be multiple records especially for rental properties which are + improved to meet recent regulations. + + This function removing duplicates with the same BUILDING REFERENCE + NUMBER by selecting the most recent record and discarding others. + + :param df: Raw EPC dataset. + :type df: pandas.DataFrame + :return: EPC dataset without duplicate Certificates. + :rtype: pandas.DataFrame + """ + df["lodgement-datetime"] = pd.to_datetime(df["lodgement-datetime"]) + df = df.sort_values(by=["building-reference-number", "lodgement-datetime"]) + df.drop_duplicates( + subset=["building-reference-number"], keep="last", inplace=True + ) + df.sort_index(inplace=True) + df.reset_index(drop=True, inplace=True) + drop_list = ["building-reference-number", "lodgement-datetime"] + df.drop(drop_list, axis=1, inplace=True) + return df + + @staticmethod + def set_categorical_code(df, df_col, lookup, rename=False): + """ Apply the lookup to a categorical column. + + Transform the values in a dataframe column using a lookup dictionary. + This method is valid when the column values are categorical. + + :param df: The input dataframe. + :type df: pandas.dataframe + :param df_col: The column in df that represents the categorical values. + :type df_col: string + :param lookup: A dictionary from categorical values to categorical codes. + :type lookup: dict + :param rename: The new column name after transformation (if false, keep + the current name), defaults to False. + :type rename: bool, optional + :return: Returns the data with the updated column. + :rtype: pandas.DataFrame + """ + + # This looks redundant, but ensures that the function works even for + # missing values (returning empty code). 
+ def augment(x, lookup): + try: + return lookup[x] + except: + return + + # setting new values according the rename_dict + df[df_col] = df[df_col].apply(func=lambda x: augment(x, lookup)) + + # remove empty rows + df.dropna(subset=[df_col], inplace=True) + + # rename column + if rename: + df.rename({df_col: rename}, axis=1, inplace=True) + + @staticmethod + def set_numerical_code(df, df_col, lookup, rename=False): + """Apply the lookup to a numerical column + + Transform the values in a dataframe column using a lookup dictionary. + This method is valid when the column values are numerical, following + the rule: + + if (j < value <= k), then, (value = i). + + :param df: The input dataframe. + :type df: pandas.dataframe + :param df_col: The column in df that represents the numerical values. + :type df_col: string + :param lookup: A dictionary from numerical values to numerical codes; + The dictionary structure is [[i1, j1, k1], [i2, j2, k2], ..., + [iN, jN, kN]], where: iN is the desired code for band N, jN is the + minimum value of the band N (not included), kN is the maximum value + of the band N (included), and N is the number of bands. + :type lookup: dict + :param rename: The new column name after transformation (if false, keep + the current name), defaults to False. + :type rename: bool, optional + """ + for band in lookup: + df.loc[(df[df_col] > band[1]) & (df[df_col] <= band[2]), df_col] = band[0] + + # remove out bound and empty rows + df.dropna(subset=[df_col], inplace=True) + + if rename: + df.rename({df_col: rename}, axis=1, inplace=True) + + def set_lookups(self, df): + """Update all columns using the lookups dictionaries. 
+ + Update the information related with area, tenure, accommodation type, + construction age band, main gas flag, and floor area, by using the self + lookup variables (accommodation_lookup, age_categorical_lookup, + age_numerical_lookup, floor_area_lookup, gas_lookup, tenure_lookup, + area_lookup) and the set_categorical_code and set_numerical_code + functions. + + :param df: Dataframe with EPC information. + :type df: pandas.Dataframe + """ + # Area: change area from postcode to output area + self.set_categorical_code(df, "postcode", self.area_lookup, rename="Area") + + # Tenure: change the tenure from EPC to SPENSER classification + self.set_categorical_code(df, "tenure", self.tenure_lookup) + + # Accommodation type: + # - create an EPC accommodation type by combining "property-type" and "built-form" + # - change the accommodation type from EPC to SPENSER classification + # - discard "property-type" and "built-form" columns + df["LC4402_C_TYPACCOM"] = df["property-type"] + ": " + df["built-form"] + self.set_categorical_code(df, "LC4402_C_TYPACCOM", self.accommodation_lookup) + df.pop("property-type") + df.pop("built-form") + + # Construction age band: + # - initially is a combination of categorical and numeric values + # - convert all categorical values into absolute ages + # - groups the absolute build age into bands + self.set_categorical_code( + df, + "construction-age-band", + self.age_categorical_lookup, + rename="ACCOM_AGE", + ) + df["ACCOM_AGE"] = df["ACCOM_AGE"].apply(pd.to_numeric) + self.set_numerical_code(df, "ACCOM_AGE", self.age_numerical_lookup) + + # Main gas flag: change the values (N, Y) to (0, 1) + self.set_categorical_code(df, "mains-gas-flag", self.gas_lookup, rename="GAS") + + # Floor Area: groups the absolute area into bands + area_max_lim = self.floor_area_lookup[-1][2] + df.rename({"total-floor-area": "FLOOR_AREA"}, axis=1, inplace=True) + df.drop(df[df.FLOOR_AREA > area_max_lim].index, inplace=True) + self.set_numerical_code(df, 
"FLOOR_AREA", self.floor_area_lookup) + + def step(self, lad_code): + """EPC data preparation main step. + + For each given local authority, this functions get the raw EPC data + using an API approach and then return a processed EPC dataset. + + :param lad_code: Local authority code. + :type lad_code: string + :return: processed EPC data + :rtype: pandas.DataFrame + """ + # Create EPC dataframe for local authority lad_code + df = self.get_epc_dataframe(lad_code) + + df = self.remove_duplicates(df) + + # Apply all lookups + self.set_lookups(df) + + # Change selected columns to integer values + cols = ["FLOOR_AREA", "ACCOM_AGE", "GAS", "tenure", "LC4402_C_TYPACCOM"] + df[cols] = df[cols].applymap(np.int64) + return df + + +class Spenser: + """Class to represent the SPENSER data and related parameters/methods.""" + + def __init__(self) -> None: + """Initialise a Spenser class.""" + # Configure SPENSER related parameters from "config/spenser.yaml" + spenser_yaml = open("config/spenser.yaml") + parsed_spenser = yaml.load(spenser_yaml, Loader=yaml.FullLoader) + spenser_url = parsed_spenser.get("spenser_url") + r = requests.get(spenser_url) + self.spenser_zip_file = zipfile.ZipFile(io.BytesIO(r.content)) + + def set_new_tenure(self, df) -> pd.DataFrame: + """Create new temporary tenure column + + This method creates a new tenure column (following EPC values) where + the sub-categories + - "Owned outright"(=2) + - shared ownership" (=3) + are merged into a general "Owner-occupied" (=1) category. + + :param df: original SPENSER data frame + :type df: pandas.Dataframe + :return: SPENSER data frame with a new column + :rtype: pandas.DataFrame + """ + df["tenure"] = df["LC4402_C_TENHUK11"].copy() + df.loc[(df["tenure"] == 2), "tenure"] = 1 + df.loc[(df["tenure"] == 3), "tenure"] = 1 + df["tenure"] = df["tenure"].map(np.int64) + + return df + + def step(self, lad_code): + """SPENSER data preparation main step. 
+ + For each given local authority, this functions get the raw EPC data + from a zip file and then return a processed SPENSER dataset. + + :param lad_code: Local authority code. + :type lad_code: string + :return: processed SPENSER data + :rtype: pandas.DataFrame + """ + + # From the zipfile - open the local authority file + lad_file = "_".join(["msm_england/ass_hh", lad_code, "OA11_2020.csv"]) + df = pd.read_csv(self.spenser_zip_file.open(lad_file)) + + # Remove "empty" rows: empty codes (here, negative values) are a problem + # for PSM method. + # TODO: store the "empty" rows in other variable to be possible append + # then at the end. + df.drop(df[df.LC4402_C_TENHUK11 < 0].index, inplace=True) + df.drop(df[df.LC4402_C_TYPACCOM < 0].index, inplace=True) + + # create new tenure + df = self.set_new_tenure(df) + + return df diff --git a/01-ActivityModel/activity-model/activity_model/enriching_population.py b/01-ActivityModel/activity-model/activity_model/enriching_population.py new file mode 100644 index 0000000..13e89a5 --- /dev/null +++ b/01-ActivityModel/activity-model/activity_model/enriching_population.py @@ -0,0 +1,324 @@ +from random import choices +import zipfile +from causalinference import CausalModel +import numpy as np +import pandas as pd +from sklearn.neighbors import NearestNeighbors +import yaml +import seaborn as sns +import matplotlib.pyplot as plt +import os +from matplotlib.ticker import MaxNLocator + + +class EnrichingPopulation: + """Class to enrich a synthetic population. + + To create a enriched synthetic population, this class combines two + pandas.DataFrames using the Propensity Score Machting approach. 
+ """ + + def __init__(self) -> None: + """Initialise an EnrichingPopulation class.""" + # Configure PSM related parameters from "config/psm.yaml" + psm_yaml = open("config/psm.yaml") + parsed_psm = yaml.load(psm_yaml, Loader=yaml.FullLoader) + self.n_neighbors = parsed_psm.get("n_neighbors") + self.overlap_columns = parsed_psm.get("overlap_columns") + self.matches_columns = parsed_psm.get("matches_columns") + + @staticmethod + def set_treatment(df0, df1): + """Create a "Treatment" column in each dataframe. + + :param df0: SPENSER data + :type df0: pandas.DataFrame + :param df1: EPC data + :type df1: pandas.DataFrame + :return: Two dataframes, first the SPENSER data + new column + ("Treatment" = 0), second the EPC data + new column ("Treatment" = 1) + :rtype: pandas.DataFrame, pandas.DataFrame + """ + + df0["Treatment"] = 0 + df1["Treatment"] = 1 + + return df0, df1 + + @staticmethod + def set_area_factor(df): + """Add a new Area column by factorizing the Area codes. + + :param df: Dataset with a Area column. + :type df: pandas.DataFrame + :return: Input dataset + new factorized Area column + :rtype: pandas.DataFrame + """ + Area_factor = df.Area.factorize() + df["Area_factor"] = Area_factor[0] + + return df + + @staticmethod + def get_propensity_score(df, overlap_columns): + """Return the propensity score values. + + :param df: complete dataframe + :type df: pandas.DataFrame + :param overlap_columns: list of columns names that are present in both + datasets (EPC and SPENSER). + :type overlap_columns: list + :return: list of propensity score for all rows. 
+ :rtype: numpy.ndarray + """ + + ## Isolate the Y, X and the covariates + Y = df["Treatment"].copy() # 1-Dimension outcome - arbitrary values + X = df["Treatment"].copy() # 1-Dimension treatment + C = df[overlap_columns].copy() # n-Dimension covariates + + # Transform pandas dataframe into numpy.ndarray (CausalModel requisite) + Y = Y.values + X = X.values + C = C.values + + # Create the Causal Model + model = CausalModel(Y, X, C) + + # Propensity score calculation + model.est_propensity_s() + return model.propensity["fitted"] + + @staticmethod + def get_neighbors(df1, df2, n_neighbors): + """For each SPENSER row get a list of EPC rows with the closest propensity score values. + + :param df1: SPENSER dataset + :type df1: pandas.DataFrame + :param df2: EPC dataset + :type df2: pandas.DataFrame + :param n_neighbors: Number of neighbors. + :type n_neighbors: integer + :return: The propensity score difference and the indices of the closest neighbors. + :rtype: list, list + """ + # create the neighbors object (p=2 means Euclidean distance) + knn = NearestNeighbors(n_neighbors=n_neighbors, p=2).fit(df2[["ps"]]) + + # for each household in df1 dataframe, find the nearest df2 neighbors + distances, indices = knn.kneighbors(df1[["ps"]]) + return distances, indices + + @staticmethod + def get_matches(distances, indices, n_neighbors): + """From the neighbors list get one match for each SPENSER row. + + EPC rows with the same propensity score value have the same probability + of being matched with a SPENSER row. The greater the difference between + the propensity score values, the lower the probability of being drawn. + The weight function used, is a step function. + + :param distances: List of propensity score difference between the closest neighbors. + :type distances: list + :param indices: List of the closest neighbors indices. + :type indices: list + :param n_neighbors: Number of neighbors. + :type n_neighbors: integer + :return: List of assigned pairs. 
+ :rtype: list + """ + pairs = [] + for index1, candidates2 in enumerate(indices): + is_zero = np.flatnonzero(distances[index1] == 0) + if is_zero.size < n_neighbors: + weight = 100 - (distances[index1] / distances[index1][-1] * 95) + index2 = choices(candidates2, weights=weight)[0] + else: + index2 = choices(candidates2)[0] + pairs.append([index1, index2]) + + return pairs + + @staticmethod + def get_enriched_pop(pairs, df1, df2, matches_columns): + """Returns the SPENSER enriched population. + + Combine the EPC data with the SPENSER data to generated a enriched + synthetic population. To combine the datasets, the propensity score + matching method is used. + + :param pairs: List of assigned pairs. + :type pairs: list + :param df1: SPENSER dataset. + :type df1: pandas.DataFrame + :param df2: EPC dataset. + :type df2: pandas.DataFrame + :param matches_columns: List of columns name from EPC to be incorporated + into SPENSER dataset. + :type matches_columns: list + :return: The enriched synthetic population + :rtype: pandas.DataFrame + """ + # Add matched df2 index id in df1 dataframe + matches = pd.DataFrame(pairs) + df1["EPCid"] = matches[1] + + drop_list = [ + "tenure", + "ps", + "Treatment", + "Area_factor", + *matches_columns, + ] + df1.drop(drop_list, axis=1, inplace=True) + + df2 = df2[matches_columns].copy() + df2["EPCid"] = df2.index + + df1 = pd.merge(df1, df2, on="EPCid", how="left") + df1.drop(["EPCid"], axis=1, inplace=True) + + return df1 + + @staticmethod + def save_enriched_pop(list_df_names, list_df): + """Save the synthetic population. + + Save the synthetic population into a zip file. Each local authority is + stored in a different csv file. + + :param list_df_names: Names of each csv file. + :type list_df_names: list + :param list_df: List of dataframes. 
+ :type list_df: list + """ + if not (os.path.exists("data/output/")): + os.makedirs("data/output/") + + # save final population + csv_name = os.path.join("data/output/", "msm_epc_england.zip") + with zipfile.ZipFile(csv_name, "w") as csv_zip: + for i in range(len(list_df_names)): + csv_zip.writestr( + list_df_names[i], list_df[i].to_csv(index=False, header=True) + ) + + @staticmethod + def save_psm_fig(df, lad_code): + """Save the propensity score distribution image. + + This image is a way to visualize the compatibility between the two + datasets. + + :param df: SPENSER + EPC merged dataset. + :type df: pandas.DataFrame + :param lad_code: Local authority district code. + :type lad_code: string + """ + if not (os.path.exists("data/output/")): + os.makedirs("data/output/") + + sns.set(color_codes=True) + # Propensity score comparison plot + plt.figure(figsize=(10, 7), dpi=80) + sns.kdeplot(df["ps"], hue=df["Treatment"], shade=True) + + # * MSM: `Treatment` = 0 + # * EPC: `Treatment` = 1 + plt.xlabel("Propensity Score") + fig_name = "_".join([lad_code, "propensity_score.png"]) + fig_name = os.path.join("data/output/", fig_name) + plt.savefig(fig_name) + plt.close() + + @staticmethod + def save_validation_fig(df1, df2, lad_code): + """Save the internal validation image. + + Floor Area distribution and Accommodation age codes distribution + comparison between original EPC data and Enriched Synthetic population. + + :param df1: SPENSER dataset. + :type df1: pandas.DataFrame + :param df2: EPC dataset. + :type df2: pandas.DataFrame + :param lad_code: Local authority district code. 
+ :type lad_code: string + """ + if not (os.path.exists("data/output/")): + os.makedirs("data/output/") + + fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 10)) + + bins = list(range(1, 20, 1)) + df2.FLOOR_AREA.plot(kind="hist", ax=ax[0][0], title="EPC", bins=bins) + df2.ACCOM_AGE.plot(kind="hist", ax=ax[1][0], title="EPC", bins=10) + + df1.FLOOR_AREA.plot( + kind="hist", ax=ax[0][1], title="Enriched SPENSER", bins=bins + ) + df1.ACCOM_AGE.plot(kind="hist", ax=ax[1][1], title="Enriched SPENSER", bins=10) + + ax[0][0].xaxis.set_major_locator(MaxNLocator(integer=True)) + ax[0][1].xaxis.set_major_locator(MaxNLocator(integer=True)) + + ax[0][0].set_xlabel("Floor area code") + ax[0][1].set_xlabel("Floor area code") + + ax[1][0].set_xlabel("Accommodation age code") + ax[1][1].set_xlabel("Accommodation age code") + + fig.tight_layout(pad=3.0) + fig_name = "_".join([lad_code, "validation.png"]) + fig_name = os.path.join("data/output/", fig_name) + plt.savefig(fig_name) + plt.close() + + def step(self, df0, df1, lad_code, psm_fig=True, validation_fig=True): + """Enriching population main step. + + In this step the EPC data and the SPENSER data are combined to generate + an enriched synthetic population for a given local authority. + + :param df0: SPENSER dataset. + :type df0: pandas.DataFrame + :param df1: EPC dataset. + :type df1: pandas.DataFrame + :param lad_code: Local authority district code. + :type lad_code: string + :param psm_fig: Boolean to save the propensity score distribution image, defaults to True. + :type psm_fig: bool, optional + :param validation_fig: Boolean to save the internal validation image, defaults to True. 
+ :type validation_fig: bool, optional + :return: Enriched synthetic population + :rtype: pandas.DataFrame + """ + df0, df1 = self.set_treatment(df0, df1) + dataset = pd.concat([df0, df1], ignore_index=True, sort=False) + dataset = self.set_area_factor(dataset) + dataset["ps"] = self.get_propensity_score(dataset, self.overlap_columns) + + # TODO instead save the image every step, the image should be stored in + # a list. The list should be saved as zip file at the end. + if psm_fig: + self.save_psm_fig(dataset, lad_code) + + # Separating EPC data from MSM data + df0 = dataset.loc[dataset.Treatment == 0].reset_index(drop=True) + df1 = dataset.loc[dataset.Treatment == 1].reset_index(drop=True) + del dataset + + # Get neighbors and matched pairs + distances, indices = self.get_neighbors(df0, df1, self.n_neighbors) + pairs = self.get_matches(distances, indices, self.n_neighbors) + del distances, indices + + # Get enriched population + rich_df = self.get_enriched_pop(pairs, df0, df1, self.matches_columns) + + # TODO instead save the image every step, the image should be stored in + # a list. The list should be saved as zip file at the end. + if validation_fig: + self.save_validation_fig(rich_df, df1, lad_code) + + return rich_df diff --git a/01-ActivityModel/activity-model/config/epc_api.yaml b/01-ActivityModel/activity-model/config/epc_api.yaml new file mode 100644 index 0000000..d3e572b --- /dev/null +++ b/01-ActivityModel/activity-model/config/epc_api.yaml @@ -0,0 +1,25 @@ +################################################################################ +# Insert in this file the necessary information to download EPC data using API. +# +################################################################################ + +# EPC credentials +epc_user: "user@email" +epc_key: "user_key" + +epc_url: "https://epc.opendatacommunities.org/api/v1/domestic/search?" 
+ +epc_years: + - 2008 + - 2022 + +epc_headers: + - "postcode" + - "property-type" + - "built-form" + - "construction-age-band" + - "tenure" + - "total-floor-area" + - "mains-gas-flag" + - "building-reference-number" + - "lodgement-datetime" diff --git a/01-ActivityModel/activity-model/config/lad_codes.yaml b/01-ActivityModel/activity-model/config/lad_codes.yaml new file mode 100644 index 0000000..6f7054a --- /dev/null +++ b/01-ActivityModel/activity-model/config/lad_codes.yaml @@ -0,0 +1,8 @@ +################################################################################ +# Insert in this file the local authority codes (lad_codes) of your interest. +# The default is Haringey. +# +################################################################################ + +lad_codes: + - E09000014 \ No newline at end of file diff --git a/01-ActivityModel/activity-model/config/lookups.yaml b/01-ActivityModel/activity-model/config/lookups.yaml new file mode 100644 index 0000000..5c0e5eb --- /dev/null +++ b/01-ActivityModel/activity-model/config/lookups.yaml @@ -0,0 +1,235 @@ +################################################################################ +# In this file is listed all lookups used by the Activity Model to organize the +# EPC data to be merged with the Synthetic Population. +# +################################################################################ +# EPC data info: +# - EPCs issued from January 2008 up to and including 30 June 2021. +# - 21,440,172 Domestic EPCs +# - Download: 18 October 2021 +# +# * Note that new EPCs may include keys not listed here! +# +################################################################################ +# EPC Headers: +# Each file has 90 headers (keys). 
In this work, the headers used are:
'England and Wales: 1900-1929': 1 + 'England and Wales: 1930-1949': 1930 + 'England and Wales: 1950-1966': 1950 + 'England and Wales: 1967-1975': 1967 + 'England and Wales: 1976-1982': 1976 + 'England and Wales: 1983-1990': 1983 + 'England and Wales: 1991-1995': 1991 + 'England and Wales: 1996-2002': 1996 + 'England and Wales: 2003-2006': 2003 + 'England and Wales: 2007-2011': 2007 + 'England and Wales: 2007 onwards': 2007 + 'England and Wales: 2012 onwards': 2007 + INVALID!: null + Not applicable: null + NO DATA!: null + + +gas: + N: 0 + Y: 1 + +tenure: + Owner-occupied: 1 + owner-occupied: 1 + rental (social): 5 + Rented (social): 5 + rental (private): 6 + Rented (private): 6 + Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is no: null + unknown: null + NO DATA!: null + +age_numerical: + - + - 1 + - 0 + - 1929 + - + - 2 + - 1929 + - 1949 + - + - 3 + - 1949 + - 1966 + - + - 4 + - 1966 + - 1975 + - + - 5 + - 1975 + - 1982 + - + - 6 + - 1982 + - 1990 + - + - 7 + - 1990 + - 1995 + - + - 8 + - 1995 + - 2002 + - + - 9 + - 2002 + - 2006 + - + - 10 + - 2006 + - 3000 + +floor_area: + - + - 1 + - 0 + - 25 + - + - 2 + - 25 + - 50 + - + - 3 + - 50 + - 75 + - + - 4 + - 75 + - 100 + - + - 5 + - 100 + - 125 + - + - 6 + - 125 + - 150 + - + - 7 + - 150 + - 175 + - + - 8 + - 175 + - 200 + - + - 9 + - 200 + - 225 + - + - 10 + - 225 + - 250 + - + - 11 + - 250 + - 275 + - + - 12 + - 275 + - 300 + - + - 13 + - 300 + - 325 + - + - 14 + - 325 + - 350 + - + - 15 + - 350 + - 375 + - + - 16 + - 375 + - 400 + - + - 17 + - 400 + - 425 + - + - 18 + - 425 + - 450 + - + - 19 + - 450 + - 475 + - + - 20 + - 475 + - 500 diff --git a/01-ActivityModel/activity-model/config/psm.yaml b/01-ActivityModel/activity-model/config/psm.yaml new file mode 100644 index 0000000..7520d71 --- /dev/null +++ b/01-ActivityModel/activity-model/config/psm.yaml @@ -0,0 +1,17 @@ +################################################################################ +# Insert in 
this file the necessary information to configure the +# EnrichingPopulation class. +# +################################################################################ + +n_neighbors: 200 + +overlap_columns: + - "LC4402_C_TYPACCOM" + - "tenure" + - "Area_factor" + +matches_columns: + - "FLOOR_AREA" + - "GAS" + - "ACCOM_AGE" diff --git a/01-ActivityModel/activity-model/config/spenser.yaml b/01-ActivityModel/activity-model/config/spenser.yaml new file mode 100644 index 0000000..a191483 --- /dev/null +++ b/01-ActivityModel/activity-model/config/spenser.yaml @@ -0,0 +1,6 @@ +################################################################################ +# Insert in this file the necessary information to download SPENSER data. +# +################################################################################ + +spenser_url: "https://osf.io/623qz/download" \ No newline at end of file diff --git a/01-ActivityModel/activity-model/environment.yml b/01-ActivityModel/activity-model/environment.yml new file mode 100644 index 0000000..7175bb4 --- /dev/null +++ b/01-ActivityModel/activity-model/environment.yml @@ -0,0 +1,21 @@ +name: energyflex1 +channels: + - conda-forge + - defaults +dependencies: + - matplotlib=3.5.1 + - numpy=1.21.2 + - pandas=1.4.1 + - python=3.9.7 + - pip=21.2.4 + - pyyaml=6.0 + - requests=2.27.1 + - scikit-learn=1.0.2 + - seaborn=0.11.2 + - setuptools=58.0.4 + - pytest=6.2.5 + - black=19.10b0 + - sphinx=4.4.0 + - sphinx_rtd_theme=0.4.3 + - pip: + - causalinference==0.1.3 diff --git a/01-ActivityModel/activity-model/setup.py b/01-ActivityModel/activity-model/setup.py new file mode 100644 index 0000000..86456ec --- /dev/null +++ b/01-ActivityModel/activity-model/setup.py @@ -0,0 +1,32 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="activity-model", + version="0.1.0", + author="Patricia Ternes", + author_email="p.ternesdallagnollo@leeds.ac.uk", + description="The Activity Model 
package", + long_description=long_description, + long_description_content_type="text/markdown", + # url="#", + # install_requires=[ + # "numpy=1.21.2", + # "pandas=1.4.1", + # "requests=2.27.1", + # "causalinference==0.1.3", + # "scikit-learn=1.0.2", + # "seaborn=0.11.2", + # "matplotlib=3.5.1", + # "pyyaml=6.0", + # ] + packages=setuptools.find_packages(), + classifiers=[ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3.9", + "Intended Audience :: Science/Research", + ], + python_requires=">=3.9", +) diff --git a/01-ActivityModel/activity-model/tests/__init__.py b/01-ActivityModel/activity-model/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/01-ActivityModel/activity-model/tests/test_activity_model.py b/01-ActivityModel/activity-model/tests/test_activity_model.py new file mode 100644 index 0000000..6d0a755 --- /dev/null +++ b/01-ActivityModel/activity-model/tests/test_activity_model.py @@ -0,0 +1,48 @@ +from activity_model import __version__ +from activity_model.data_preparation import Epc + +import requests +import pytest + + +def test_version(): + assert __version__ == "0.1.0" + + +@pytest.fixture +def epc(): + epc = Epc() + return epc + + +def test_lookup_type(epc): + assert type(epc.accommodation_lookup) is dict + assert type(epc.age_categorical_lookup) is dict + assert type(epc.gas_lookup) is dict + assert type(epc.tenure_lookup) is dict + assert type(epc.age_numerical_lookup) is list + assert type(epc.floor_area_lookup) is list + assert type(epc.area_lookup) is dict + + +def test_epc_connection(epc): + url = epc.epc_url + user = epc.epc_user + key = epc.epc_key + headers = {"Accept": "text/csv"} + + r = requests.head(url, headers=headers, auth=(user, key)) + assert ( + r.status_code == 200 + ), "Please check your EPC credentials here: config/epc_api.yaml" + + +# test area lookup connection? +# test spenser connection? 
+ +# test if area column has the right values +# test if floor area has the right values +# test if age has the right values +# test if gas has the right values +# test if tenure has the right values +# test if accommodation type has the right values