From db71c0cdc2f3648faa062e2a1f17fa9c20446d33 Mon Sep 17 00:00:00 2001
From: Patricia Ternes
Date: Mon, 11 Apr 2022 14:45:15 +0100
Subject: [PATCH 1/2] add code for activity model
---
.gitignore | 1 +
01-ActivityModel/activity-model/README.md | 168 +++++++++
.../activity-model/activity_model/__init__.py | 1 +
.../activity-model/activity_model/__main__.py | 28 ++
.../activity_model/data_preparation.py | 350 ++++++++++++++++++
.../activity_model/enriching_population.py | 324 ++++++++++++++++
.../activity-model/config/epc_api.yaml | 25 ++
.../activity-model/config/lad_codes.yaml | 8 +
.../activity-model/config/lookups.yaml | 235 ++++++++++++
.../activity-model/config/psm.yaml | 17 +
.../activity-model/config/spenser.yaml | 6 +
.../activity-model/environment.yml | 21 ++
01-ActivityModel/activity-model/setup.py | 32 ++
.../activity-model/tests/__init__.py | 0
.../tests/test_activity_model.py | 48 +++
15 files changed, 1264 insertions(+)
create mode 100644 01-ActivityModel/activity-model/README.md
create mode 100644 01-ActivityModel/activity-model/activity_model/__init__.py
create mode 100644 01-ActivityModel/activity-model/activity_model/__main__.py
create mode 100644 01-ActivityModel/activity-model/activity_model/data_preparation.py
create mode 100644 01-ActivityModel/activity-model/activity_model/enriching_population.py
create mode 100644 01-ActivityModel/activity-model/config/epc_api.yaml
create mode 100644 01-ActivityModel/activity-model/config/lad_codes.yaml
create mode 100644 01-ActivityModel/activity-model/config/lookups.yaml
create mode 100644 01-ActivityModel/activity-model/config/psm.yaml
create mode 100644 01-ActivityModel/activity-model/config/spenser.yaml
create mode 100644 01-ActivityModel/activity-model/environment.yml
create mode 100644 01-ActivityModel/activity-model/setup.py
create mode 100644 01-ActivityModel/activity-model/tests/__init__.py
create mode 100644 01-ActivityModel/activity-model/tests/test_activity_model.py
diff --git a/.gitignore b/.gitignore
index 30f27b9..38965d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
.vscode/*
*.ipynb_checkpoints
+*.pytest_cache*
01-ActivityModel/data/*
*all-domestic-certificates.zip
*epc_england.zip
diff --git a/01-ActivityModel/activity-model/README.md b/01-ActivityModel/activity-model/README.md
new file mode 100644
index 0000000..5675d08
--- /dev/null
+++ b/01-ActivityModel/activity-model/README.md
@@ -0,0 +1,168 @@
+# Activity Model
+
+This is the code repository for the Activity Model package.
+
+The Activity Model returns a synthetic population to represent the England
+household population. This model, however, does not create the population from
+scratch, it uses a well-known synthetic population, the SPENSER, and adds the
+following information for each household:
+
+- Accommodation floor area (band)
+- Accommodation age (band)
+- Gas (flag: Y/N)
+
+The accommodation information is originally obtained from Domestic Energy
+Performance Certificates (EPC) and then codified in this package.
+
+To enrich the SPENSER population with the EPC data, the Propensity Score
+Matching (PSM) method is applied.
+
+The main output is an enriched synthetic population that we use as input for
+energy estimation models developed by the Energy Flexibility Project.
+
+## Environment setup
+
+This package currently supports running on Linux.
+
+To start working with this repository you need to clone it onto your local
+machine:
+
+```bash
+$ git clone https://github.com/anetobradley/energy_flex.git
+$ cd energy_flex/01-ActivityModel
+```
+
+This package requires a specific
+[conda](https://docs.anaconda.com/anaconda/install/) environment.
+You can create an environment for this project using the provided
+environment file:
+
+```bash
+$ conda env create -f environment.yml
+$ conda activate energyflex
+```
+
+## Configuring the model
+
+### Required
+
+#### EPC Credentials
+
+To retrieve data to run the model you will need to have EPC-API credentials.
+You can register [here](https://epc.opendatacommunities.org/#register).
+Next you need to add your credentials into the
+[epc_api](./config/epc_api.yaml) file (you can use your favourite text
+editor for this):
+
+```bash
+$ nano config/epc_api.yaml
+# EPC credentials
+epc_user: "user@email"
+epc_key: "user_key"
+```
+
+#### Local Authority codes
+
+You need to provide the code for all Local Authorities for which you want a
+synthetic population. Please insert the values [here](./config/lad_codes.yaml).
+If you do not provide any additional value, the default is to return the
+population just for Haringey.
+
+You can find
+[here](https://epc.opendatacommunities.org/docs/api/domestic#domestic-local-authority)
+all LAD codes available in the EPC data.
+
+### Optional
+
+#### Year
+
+You can define [here](./config/epc_api.yaml) a different range of the EPC
+lodgement date (the default is 2008-2022).
+
+#### EPC variables
+
+If you want to enrich the synthetic population with more EPC variables you
+need to add them in two lists:
+
+- [epc_api config file](./config/epc_api.yaml) under `epc_headers`.
+- [psm config file](./config/psm.yaml) under `matches_columns`.
+
+You can find a complete EPC Glossary
+[here](https://epc.opendatacommunities.org/docs/guidance#glossary),
+but be aware that there is a difference between the spellings of the terms
+described in this list and how they are used in the API. In our experience the
+differences are:
+
+- capital letters must be written in lowercase letters.
+- underscore must be replaced by a hyphen.
+
+We also warn that most of the information is unencoded, which can make it
+difficult to use (as well as making the output file unnecessarily large).
+The default variables (accommodation floor area, accommodation age, gas)
+are properly encoded and organized by this package.
+
+#### Data url
+
+Three datasets are obtained through URLs:
+
+- EPC data
+- SPENSER data
+- Area lookup data
+
+If you want to use different urls, you can change them in:
+
+- EPC url [here](./config/epc_api.yaml) under `epc_url`
+- SPENSER url [here](./config/spenser.yaml) under `spenser_url`
+- Area lookup url [here](./config/lookups.yaml) under `area_url`
+
+Note: You can obtain data from other places, after all new
+versions are expected, but it is necessary to ensure that the data structure
+is similar or the code will not work.
+
+#### Area granularity
+
+The default granularity is Output Areas, but you can use others, like:
+
+- Lower Layer Super Output Areas (`lsoa11cd`)
+- Middle Layer Super Output Areas (`msoa11cd`)
+- Local authority districts (`ladcd`)
+
+To change this, please use the `area_in_out` variable
+[here](./config/lookups.yaml).
+
+Note that if you change the Area lookup url, the granularity codes may also
+change!
+
+## Installation & Usage
+
+Next we install the Activity Model package into the environment using `setup.py`:
+
+```bash
+# for using the code base use
+$ python setup.py install
+```
+
+## Running the model
+
+If you installed the package with the `setup.py` file, to run the model:
+
+```bash
+$ python activity_model
+```
+
+If you did not install the package with the `setup.py` file, you can run the
+code through
+
+```bash
+# for using the code base use
+$ python activity_model/__main__.py
+```
+
+## Outputs
+
+The outputs are stored at `data/output/`. Three outputs are expected:
+
+1. Propensity score distribution images for each local authority.
+2. Internal validation images for each local authority.
+3. Enriched synthetic population for each local authority (CSV file).
+ All CSV files are compressed into a zip file.
diff --git a/01-ActivityModel/activity-model/activity_model/__init__.py b/01-ActivityModel/activity-model/activity_model/__init__.py
new file mode 100644
index 0000000..b794fd4
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/__init__.py
@@ -0,0 +1 @@
+__version__ = '0.1.0'
diff --git a/01-ActivityModel/activity-model/activity_model/__main__.py b/01-ActivityModel/activity-model/activity_model/__main__.py
new file mode 100644
index 0000000..83d09f3
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/__main__.py
@@ -0,0 +1,28 @@
+import yaml
+
+from data_preparation import Epc, Spenser
+from enriching_population import EnrichingPopulation
+
+if __name__ == "__main__":
+ print("hi")
+ spenser = Spenser()
+ epc = Epc()
+ psm = EnrichingPopulation()
+
+ list_df = []
+ list_df_names = []
+ lad_codes_yaml = open("config/lad_codes.yaml")
+ parsed_lad_codes = yaml.load(lad_codes_yaml, Loader=yaml.FullLoader)
+ lad_codes = parsed_lad_codes.get("lad_codes")
+
+ for lad_code in lad_codes:
+ spenser_df = spenser.step(lad_code)
+ epc_df = epc.step(lad_code)
+ rich_df = psm.step(
+ spenser_df, epc_df, lad_code, psm_fig=True, validation_fig=True
+ )
+ list_df_names.append("_".join([lad_code, "hh_msm_epc.csv"]))
+ list_df.append(rich_df)
+
+ psm.save_enriched_pop(list_df_names, list_df)
+
diff --git a/01-ActivityModel/activity-model/activity_model/data_preparation.py b/01-ActivityModel/activity-model/activity_model/data_preparation.py
new file mode 100644
index 0000000..7e96b17
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/data_preparation.py
@@ -0,0 +1,350 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import yaml
+import pandas as pd
+import requests
+import io
+import zipfile
+
+
+class Epc:
+    """Class to represent the EPC data and related parameters/methods."""
+
+ def __init__(self) -> None:
+ """Initialise an EPC class."""
+ # Configure epc api related parameters from "config/epc_api.yaml"
+ epc_api_yaml = open("config/epc_api.yaml")
+ parsed_epc_api = yaml.load(epc_api_yaml, Loader=yaml.FullLoader)
+ self.epc_user = parsed_epc_api.get("epc_user")
+ self.epc_key = parsed_epc_api.get("epc_key")
+ self.epc_url = parsed_epc_api.get("epc_url")
+ self.epc_years = parsed_epc_api.get("epc_years")
+ self.desired_headers = parsed_epc_api.get("epc_headers")
+
+ # Using epc api info to build all base url-filter
+ self.epc_filter = self.get_epc_url_filter()
+
+ # Configure lookups
+ ## Lookups from "config/lookups.yaml" file
+ lookup_yaml = open("config/lookups.yaml")
+ parsed_lookup = yaml.load(lookup_yaml, Loader=yaml.FullLoader)
+ self.accommodation_lookup = parsed_lookup.get("accommodation")
+ self.age_categorical_lookup = parsed_lookup.get("age_categorical")
+ self.age_numerical_lookup = parsed_lookup.get("age_numerical")
+ self.floor_area_lookup = parsed_lookup.get("floor_area")
+ self.gas_lookup = parsed_lookup.get("gas")
+ self.tenure_lookup = parsed_lookup.get("tenure")
+ url = parsed_lookup.get("area_url")
+ area_in_out = parsed_lookup.get("area_in_out")
+ area_lookup = pd.read_csv(
+ url,
+ compression="zip",
+ # usecols=[area_in, area_out],
+ usecols=[area_in_out[0], area_in_out[1]],
+ encoding="unicode_escape",
+ engine="python",
+ )
+ self.area_lookup = (
+ area_lookup.set_index(area_in_out[0], drop=True)
+ .loc[:, area_in_out[1]]
+ .to_dict()
+ )
+
+ def get_epc_url_filter(self):
+ """Build a list of EPC search filters urls.
+
+ According to EPC-API
+ [documentation](https://epc.opendatacommunities.org/docs/api/domestic)
+ the API is designed to return up to 10,000 records at a time, with a
+ maximum page size of 5,000. If more than 10,000 records are required,
+        it is necessary to vary the search filters and make multiple requests.
+
+ This method returns a list of filter urls to get the maximum possible
+ volume of data. Each filter url covers 4 months.
+
+ :return: EPC-API urls with filters.
+ :rtype: list
+ """
+
+ url_filter = []
+ for i in range(self.epc_years[0], self.epc_years[1], 1):
+ for j in range(3):
+ for k in range(2):
+ search = f"size=5000&from-year={i}&from-month={(j*4)+1}&to-year={i}&to-month={(j+1)*4}&from={k*5000}&local-authority="
+ url_filter.append(self.epc_url + search)
+ return url_filter
+
+ def get_epc_dataframe(self, lad_code) -> pd.DataFrame:
+ """Get EPC data for a given local authority.
+
+ This function uses the EPC-API to get a large amount of data.
+ Due to data limitation per request, several filters are considered.
+
+        Note 1: You need to insert a valid EPC user/key (config/epc_api.yaml)
+
+        Note 2: Some data intervals return a null value (usually in early 2008) and
+ an exception is used to avoid errors in this case.
+
+ :param lad_code: Local authority code.
+ :type lad_code: string
+ :return: A data frame with all EPC collected data.
+ :rtype: pandas.DataFrame
+ """
+ url_filter = [s + lad_code for s in self.epc_filter]
+ headers = {"Accept": "text/csv"}
+ list_df = []
+ for url in url_filter:
+ try:
+ res = requests.get(
+ url, headers=headers, auth=(self.epc_user, self.epc_key)
+ ).content
+ df = pd.read_csv(
+ io.StringIO(res.decode("utf-8")), usecols=self.desired_headers
+ )
+ list_df.append(df)
+ except pd.errors.EmptyDataError:
+ """
+                Some data intervals return a null value (usually in early 2008).
+ This Exception is raised to avoid errors in this situation.
+ Warning: Problems in EPC-API may be difficult to follow.
+ """
+ pass
+ return pd.concat(list_df)
+
+ @staticmethod
+ def remove_duplicates(df):
+ """Remove EPC Duplicate Certificates
+
+ When using the EPC datasets we need to be careful with duplicate EPCs
+ for the same property. While not an enormous issue as an EPC is valid
+ for up to 10 years unless the property is renovated or retrofitted,
+ there may be multiple records especially for rental properties which are
+ improved to meet recent regulations.
+
+        This function removes duplicates with the same BUILDING REFERENCE
+ NUMBER by selecting the most recent record and discarding others.
+
+ :param df: Raw EPC dataset.
+ :type df: pandas.DataFrame
+ :return: EPC dataset without duplicate Certificates.
+ :rtype: pandas.DataFrame
+ """
+ df["lodgement-datetime"] = pd.to_datetime(df["lodgement-datetime"])
+ df = df.sort_values(by=["building-reference-number", "lodgement-datetime"])
+ df.drop_duplicates(
+ subset=["building-reference-number"], keep="last", inplace=True
+ )
+ df.sort_index(inplace=True)
+ df.reset_index(drop=True, inplace=True)
+ drop_list = ["building-reference-number", "lodgement-datetime"]
+ df.drop(drop_list, axis=1, inplace=True)
+ return df
+
+ @staticmethod
+ def set_categorical_code(df, df_col, lookup, rename=False):
+ """ Apply the lookup to a categorical column.
+
+ Transform the values in a dataframe column using a lookup dictionary.
+ This method is valid when the column values are categorical.
+
+ :param df: The input dataframe.
+ :type df: pandas.dataframe
+ :param df_col: The column in df that represents the categorical values.
+ :type df_col: string
+ :param lookup: A dictionary from categorical values to categorical codes.
+ :type lookup: dict
+ :param rename: The new column name after transformation (if false, keep
+ the current name), defaults to False.
+ :type rename: bool, optional
+ :return: Returns the data with the updated column.
+ :rtype: pandas.DataFrame
+ """
+
+ # This looks redundant, but ensures that the function works even for
+ # missing values (returning empty code).
+ def augment(x, lookup):
+ try:
+ return lookup[x]
+ except:
+ return
+
+ # setting new values according the rename_dict
+ df[df_col] = df[df_col].apply(func=lambda x: augment(x, lookup))
+
+ # remove empty rows
+ df.dropna(subset=[df_col], inplace=True)
+
+ # rename column
+ if rename:
+ df.rename({df_col: rename}, axis=1, inplace=True)
+
+ @staticmethod
+ def set_numerical_code(df, df_col, lookup, rename=False):
+ """Apply the lookup to a numerical column
+
+ Transform the values in a dataframe column using a lookup dictionary.
+ This method is valid when the column values are numerical, following
+ the rule:
+
+ if (j < value <= k), then, (value = i).
+
+ :param df: The input dataframe.
+ :type df: pandas.dataframe
+ :param df_col: The column in df that represents the numerical values.
+ :type df_col: string
+ :param lookup: A dictionary from numerical values to numerical codes;
+ The dictionary structure is [[i1, j1, k1], [i2, j2, k2], ...,
+ [iN, jN, kN]], where: iN is the desired code for band N, jN is the
+ minimum value of the band N (not included), kN is the maximum value
+ of the band N (included), and N is the number of bands.
+ :type lookup: dict
+ :param rename: The new column name after transformation (if false, keep
+ the current name), defaults to False.
+ :type rename: bool, optional
+ """
+ for band in lookup:
+ df.loc[(df[df_col] > band[1]) & (df[df_col] <= band[2]), df_col] = band[0]
+
+ # remove out bound and empty rows
+ df.dropna(subset=[df_col], inplace=True)
+
+ if rename:
+ df.rename({df_col: rename}, axis=1, inplace=True)
+
+ def set_lookups(self, df):
+ """Update all columns using the lookups dictionaries.
+
+ Update the information related with area, tenure, accommodation type,
+ construction age band, main gas flag, and floor area, by using the self
+ lookup variables (accommodation_lookup, age_categorical_lookup,
+ age_numerical_lookup, floor_area_lookup, gas_lookup, tenure_lookup,
+ area_lookup) and the set_categorical_code and set_numerical_code
+ functions.
+
+ :param df: Dataframe with EPC information.
+ :type df: pandas.Dataframe
+ """
+ # Area: change area from postcode to output area
+ self.set_categorical_code(df, "postcode", self.area_lookup, rename="Area")
+
+ # Tenure: change the tenure from EPC to SPENSER classification
+ self.set_categorical_code(df, "tenure", self.tenure_lookup)
+
+ # Accommodation type:
+ # - create an EPC accommodation type by combining "property-type" and "built-form"
+ # - change the accommodation type from EPC to SPENSER classification
+ # - discard "property-type" and "built-form" columns
+ df["LC4402_C_TYPACCOM"] = df["property-type"] + ": " + df["built-form"]
+ self.set_categorical_code(df, "LC4402_C_TYPACCOM", self.accommodation_lookup)
+ df.pop("property-type")
+ df.pop("built-form")
+
+ # Construction age band:
+ # - initially is a combination of categorical and numeric values
+ # - convert all categorical values into absolute ages
+ # - groups the absolute build age into bands
+ self.set_categorical_code(
+ df,
+ "construction-age-band",
+ self.age_categorical_lookup,
+ rename="ACCOM_AGE",
+ )
+ df["ACCOM_AGE"] = df["ACCOM_AGE"].apply(pd.to_numeric)
+ self.set_numerical_code(df, "ACCOM_AGE", self.age_numerical_lookup)
+
+ # Main gas flag: change the values (N, Y) to (0, 1)
+ self.set_categorical_code(df, "mains-gas-flag", self.gas_lookup, rename="GAS")
+
+ # Floor Area: groups the absolute area into bands
+ area_max_lim = self.floor_area_lookup[-1][2]
+ df.rename({"total-floor-area": "FLOOR_AREA"}, axis=1, inplace=True)
+ df.drop(df[df.FLOOR_AREA > area_max_lim].index, inplace=True)
+ self.set_numerical_code(df, "FLOOR_AREA", self.floor_area_lookup)
+
+ def step(self, lad_code):
+ """EPC data preparation main step.
+
+        For each given local authority, this function gets the raw EPC data
+        using an API approach and then returns a processed EPC dataset.
+
+ :param lad_code: Local authority code.
+ :type lad_code: string
+ :return: processed EPC data
+ :rtype: pandas.DataFrame
+ """
+ # Create EPC dataframe for local authority lad_code
+ df = self.get_epc_dataframe(lad_code)
+
+ df = self.remove_duplicates(df)
+
+ # Apply all lookups
+ self.set_lookups(df)
+
+ # Change selected columns to integer values
+ cols = ["FLOOR_AREA", "ACCOM_AGE", "GAS", "tenure", "LC4402_C_TYPACCOM"]
+ df[cols] = df[cols].applymap(np.int64)
+ return df
+
+
+class Spenser:
+ """Class to represent the SPENSER data and related parameters/methods."""
+
+ def __init__(self) -> None:
+ """Initialise a Spenser class."""
+ # Configure SPENSER related parameters from "config/spenser.yaml"
+ spenser_yaml = open("config/spenser.yaml")
+ parsed_spenser = yaml.load(spenser_yaml, Loader=yaml.FullLoader)
+ spenser_url = parsed_spenser.get("spenser_url")
+ r = requests.get(spenser_url)
+ self.spenser_zip_file = zipfile.ZipFile(io.BytesIO(r.content))
+
+ def set_new_tenure(self, df) -> pd.DataFrame:
+ """Create new temporary tenure column
+
+ This method creates a new tenure column (following EPC values) where
+ the sub-categories
+ - "Owned outright"(=2)
+ - shared ownership" (=3)
+ are merged into a general "Owner-occupied" (=1) category.
+
+ :param df: original SPENSER data frame
+ :type df: pandas.Dataframe
+ :return: SPENSER data frame with a new column
+ :rtype: pandas.DataFrame
+ """
+ df["tenure"] = df["LC4402_C_TENHUK11"].copy()
+ df.loc[(df["tenure"] == 2), "tenure"] = 1
+ df.loc[(df["tenure"] == 3), "tenure"] = 1
+ df["tenure"] = df["tenure"].map(np.int64)
+
+ return df
+
+ def step(self, lad_code):
+ """SPENSER data preparation main step.
+
+        For each given local authority, this function gets the raw SPENSER data
+        from a zip file and then returns a processed SPENSER dataset.
+
+ :param lad_code: Local authority code.
+ :type lad_code: string
+ :return: processed SPENSER data
+ :rtype: pandas.DataFrame
+ """
+
+ # From the zipfile - open the local authority file
+ lad_file = "_".join(["msm_england/ass_hh", lad_code, "OA11_2020.csv"])
+ df = pd.read_csv(self.spenser_zip_file.open(lad_file))
+
+ # Remove "empty" rows: empty codes (here, negative values) are a problem
+ # for PSM method.
+ # TODO: store the "empty" rows in other variable to be possible append
+ # then at the end.
+ df.drop(df[df.LC4402_C_TENHUK11 < 0].index, inplace=True)
+ df.drop(df[df.LC4402_C_TYPACCOM < 0].index, inplace=True)
+
+ # create new tenure
+ df = self.set_new_tenure(df)
+
+ return df
diff --git a/01-ActivityModel/activity-model/activity_model/enriching_population.py b/01-ActivityModel/activity-model/activity_model/enriching_population.py
new file mode 100644
index 0000000..13e89a5
--- /dev/null
+++ b/01-ActivityModel/activity-model/activity_model/enriching_population.py
@@ -0,0 +1,324 @@
+from random import choices
+import zipfile
+from causalinference import CausalModel
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import NearestNeighbors
+import yaml
+import seaborn as sns
+import matplotlib.pyplot as plt
+import os
+from matplotlib.ticker import MaxNLocator
+
+
+class EnrichingPopulation:
+ """Class to enrich a synthetic population.
+
+    To create an enriched synthetic population, this class combines two
+    pandas.DataFrames using the Propensity Score Matching approach.
+ """
+
+ def __init__(self) -> None:
+ """Initialise an EnrichingPopulation class."""
+ # Configure PSM related parameters from "config/psm.yaml"
+ psm_yaml = open("config/psm.yaml")
+ parsed_psm = yaml.load(psm_yaml, Loader=yaml.FullLoader)
+ self.n_neighbors = parsed_psm.get("n_neighbors")
+ self.overlap_columns = parsed_psm.get("overlap_columns")
+ self.matches_columns = parsed_psm.get("matches_columns")
+
+ @staticmethod
+ def set_treatment(df0, df1):
+ """Create a "Treatment" column in each dataframe.
+
+ :param df0: SPENSER data
+ :type df0: pandas.DataFrame
+ :param df1: EPC data
+ :type df1: pandas.DataFrame
+ :return: Two dataframes, first the SPENSER data + new column
+ ("Treatment" = 0), second the EPC data + new column ("Treatment" = 1)
+ :rtype: pandas.DataFrame, pandas.DataFrame
+ """
+
+ df0["Treatment"] = 0
+ df1["Treatment"] = 1
+
+ return df0, df1
+
+ @staticmethod
+ def set_area_factor(df):
+ """Add a new Area column by factorizing the Area codes.
+
+ :param df: Dataset with a Area column.
+ :type df: pandas.DataFrame
+ :return: Input dataset + new factorized Area column
+ :rtype: pandas.DataFrame
+ """
+ Area_factor = df.Area.factorize()
+ df["Area_factor"] = Area_factor[0]
+
+ return df
+
+ @staticmethod
+ def get_propensity_score(df, overlap_columns):
+ """Return the propensity score values.
+
+ :param df: complete dataframe
+ :type df: pandas.DataFrame
+ :param overlap_columns: list of columns names that are present in both
+ datasets (EPC and SPENSER).
+ :type overlap_columns: list
+ :return: list of propensity score for all rows.
+ :rtype: numpy.ndarray
+ """
+
+ ## Isolate the Y, X and the covariates
+ Y = df["Treatment"].copy() # 1-Dimension outcome - arbitrary values
+ X = df["Treatment"].copy() # 1-Dimension treatment
+ C = df[overlap_columns].copy() # n-Dimension covariates
+
+ # Transform pandas dataframe into numpy.ndarray (CausalModel requisite)
+ Y = Y.values
+ X = X.values
+ C = C.values
+
+ # Create the Causal Model
+ model = CausalModel(Y, X, C)
+
+ # Propensity score calculation
+ model.est_propensity_s()
+ return model.propensity["fitted"]
+
+ @staticmethod
+ def get_neighbors(df1, df2, n_neighbors):
+ """For each SPENSER row get a list of EPC rows with the closest propensity score values.
+
+ :param df1: SPENSER dataset
+ :type df1: pandas.DataFrame
+ :param df2: EPC dataset
+ :type df2: pandas.DataFrame
+ :param n_neighbors: Number of neighbors.
+ :type n_neighbors: integer
+ :return: The propensity score difference and the indices of the closest neighbors.
+ :rtype: list, list
+ """
+ # create the neighbors object (p=2 means Euclidean distance)
+ knn = NearestNeighbors(n_neighbors=n_neighbors, p=2).fit(df2[["ps"]])
+
+ # for each household in df1 dataframe, find the nearest df2 neighbors
+ distances, indices = knn.kneighbors(df1[["ps"]])
+ return distances, indices
+
+ @staticmethod
+ def get_matches(distances, indices, n_neighbors):
+ """From the neighbors list get one match for each SPENSER row.
+
+ EPC rows with the same propensity score value have the same probability
+ of being matched with a SPENSER row. The greater the difference between
+ the propensity score values, the lower the probability of being drawn.
+ The weight function used, is a step function.
+
+ :param distances: List of propensity score difference between the closest neighbors.
+ :type distances: list
+ :param indices: List of the closest neighbors indices.
+ :type indices: list
+ :param n_neighbors: Number of neighbors.
+ :type n_neighbors: integer
+ :return: List of assigned pairs.
+ :rtype: list
+ """
+ pairs = []
+ for index1, candidates2 in enumerate(indices):
+ is_zero = np.flatnonzero(distances[index1] == 0)
+ if is_zero.size < n_neighbors:
+ weight = 100 - (distances[index1] / distances[index1][-1] * 95)
+ index2 = choices(candidates2, weights=weight)[0]
+ else:
+ index2 = choices(candidates2)[0]
+ pairs.append([index1, index2])
+
+ return pairs
+
+ @staticmethod
+ def get_enriched_pop(pairs, df1, df2, matches_columns):
+ """Returns the SPENSER enriched population.
+
+        Combine the EPC data with the SPENSER data to generate an enriched
+ synthetic population. To combine the datasets, the propensity score
+ matching method is used.
+
+ :param pairs: List of assigned pairs.
+ :type pairs: list
+ :param df1: SPENSER dataset.
+ :type df1: pandas.DataFrame
+ :param df2: EPC dataset.
+ :type df2: pandas.DataFrame
+ :param matches_columns: List of columns name from EPC to be incorporated
+ into SPENSER dataset.
+ :type matches_columns: list
+ :return: The enriched synthetic population
+ :rtype: pandas.DataFrame
+ """
+ # Add matched df2 index id in df1 dataframe
+ matches = pd.DataFrame(pairs)
+ df1["EPCid"] = matches[1]
+
+ drop_list = [
+ "tenure",
+ "ps",
+ "Treatment",
+ "Area_factor",
+ *matches_columns,
+ ]
+ df1.drop(drop_list, axis=1, inplace=True)
+
+ df2 = df2[matches_columns].copy()
+ df2["EPCid"] = df2.index
+
+ df1 = pd.merge(df1, df2, on="EPCid", how="left")
+ df1.drop(["EPCid"], axis=1, inplace=True)
+
+ return df1
+
+ @staticmethod
+ def save_enriched_pop(list_df_names, list_df):
+ """Save the synthetic population.
+
+ Save the synthetic population into a zip file. Each local authority is
+ stored in a different csv file.
+
+ :param list_df_names: Names of each csv file.
+ :type list_df_names: list
+ :param list_df: List of dataframes.
+ :type list_df: list
+ """
+ if not (os.path.exists("data/output/")):
+ os.makedirs("data/output/")
+
+ # save final population
+ csv_name = os.path.join("data/output/", "msm_epc_england.zip")
+ with zipfile.ZipFile(csv_name, "w") as csv_zip:
+ for i in range(len(list_df_names)):
+ csv_zip.writestr(
+ list_df_names[i], list_df[i].to_csv(index=False, header=True)
+ )
+
+ @staticmethod
+ def save_psm_fig(df, lad_code):
+ """Save the propensity score distribution image.
+
+ This image is a way to visualize the compatibility between the two
+ datasets.
+
+ :param df: SPENSER + EPC merged dataset.
+ :type df: pandas.DataFrame
+ :param lad_code: Local authority district code.
+ :type lad_code: string
+ """
+ if not (os.path.exists("data/output/")):
+ os.makedirs("data/output/")
+
+ sns.set(color_codes=True)
+ # Propensity score comparison plot
+ plt.figure(figsize=(10, 7), dpi=80)
+ sns.kdeplot(df["ps"], hue=df["Treatment"], shade=True)
+
+ # * MSM: `Treatment` = 0
+ # * EPC: `Treatment` = 1
+ plt.xlabel("Propensity Score")
+ fig_name = "_".join([lad_code, "propensity_score.png"])
+ fig_name = os.path.join("data/output/", fig_name)
+ plt.savefig(fig_name)
+ plt.close()
+
+ @staticmethod
+ def save_validation_fig(df1, df2, lad_code):
+ """Save the internal validation image.
+
+ Floor Area distribution and Accommodation age codes distribution
+ comparison between original EPC data and Enriched Synthetic population.
+
+ :param df1: SPENSER dataset.
+ :type df1: pandas.DataFrame
+ :param df2: EPC dataset.
+ :type df2: pandas.DataFrame
+ :param lad_code: Local authority district code.
+ :type lad_code: string
+ """
+ if not (os.path.exists("data/output/")):
+ os.makedirs("data/output/")
+
+ fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))
+
+ bins = list(range(1, 20, 1))
+ df2.FLOOR_AREA.plot(kind="hist", ax=ax[0][0], title="EPC", bins=bins)
+ df2.ACCOM_AGE.plot(kind="hist", ax=ax[1][0], title="EPC", bins=10)
+
+ df1.FLOOR_AREA.plot(
+ kind="hist", ax=ax[0][1], title="Enriched SPENSER", bins=bins
+ )
+ df1.ACCOM_AGE.plot(kind="hist", ax=ax[1][1], title="Enriched SPENSER", bins=10)
+
+ ax[0][0].xaxis.set_major_locator(MaxNLocator(integer=True))
+ ax[0][1].xaxis.set_major_locator(MaxNLocator(integer=True))
+
+ ax[0][0].set_xlabel("Floor area code")
+ ax[0][1].set_xlabel("Floor area code")
+
+ ax[1][0].set_xlabel("Accommodation age code")
+ ax[1][1].set_xlabel("Accommodation age code")
+
+ fig.tight_layout(pad=3.0)
+ fig_name = "_".join([lad_code, "validation.png"])
+ fig_name = os.path.join("data/output/", fig_name)
+ plt.savefig(fig_name)
+ plt.close()
+
+ def step(self, df0, df1, lad_code, psm_fig=True, validation_fig=True):
+ """Enriching population main step.
+
+ In this step the EPC data and the SPENSER data are combined to generate
+ an enriched synthetic population for a given local authority.
+
+ :param df0: SPENSER dataset.
+ :type df0: pandas.DataFrame
+ :param df1: EPC dataset.
+ :type df1: pandas.DataFrame
+ :param lad_code: Local authority district code.
+ :type lad_code: string
+ :param psm_fig: Boolean to save the propensity score distribution image, defaults to True.
+ :type psm_fig: bool, optional
+ :param validation_fig: Boolean to save the internal validation image, defaults to True.
+ :type validation_fig: bool, optional
+ :return: Enriched synthetic population
+ :rtype: pandas.DataFrame
+ """
+ df0, df1 = self.set_treatment(df0, df1)
+ dataset = pd.concat([df0, df1], ignore_index=True, sort=False)
+ dataset = self.set_area_factor(dataset)
+ dataset["ps"] = self.get_propensity_score(dataset, self.overlap_columns)
+
+ # TODO instead save the image every step, the image should be stored in
+ # a list. The list should be saved as zip file at the end.
+ if psm_fig:
+ self.save_psm_fig(dataset, lad_code)
+
+ # Separating EPC data from MSM data
+ df0 = dataset.loc[dataset.Treatment == 0].reset_index(drop=True)
+ df1 = dataset.loc[dataset.Treatment == 1].reset_index(drop=True)
+ del dataset
+
+ # Get neighbors and matched pairs
+ distances, indices = self.get_neighbors(df0, df1, self.n_neighbors)
+ pairs = self.get_matches(distances, indices, self.n_neighbors)
+ del distances, indices
+
+ # Get enriched population
+ rich_df = self.get_enriched_pop(pairs, df0, df1, self.matches_columns)
+
+ # TODO instead save the image every step, the image should be stored in
+ # a list. The list should be saved as zip file at the end.
+ if validation_fig:
+ self.save_validation_fig(rich_df, df1, lad_code)
+
+ return rich_df
diff --git a/01-ActivityModel/activity-model/config/epc_api.yaml b/01-ActivityModel/activity-model/config/epc_api.yaml
new file mode 100644
index 0000000..d3e572b
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/epc_api.yaml
@@ -0,0 +1,25 @@
+################################################################################
+# Insert in this file the information needed to download EPC data via the API.
+#
+################################################################################
+
+# EPC credentials
+epc_user: "user@email"
+epc_key: "user_key"
+
+epc_url: "https://epc.opendatacommunities.org/api/v1/domestic/search?"
+
+epc_years:
+ - 2008
+ - 2022
+
+epc_headers:
+ - "postcode"
+ - "property-type"
+ - "built-form"
+ - "construction-age-band"
+ - "tenure"
+ - "total-floor-area"
+ - "mains-gas-flag"
+ - "building-reference-number"
+ - "lodgement-datetime"
diff --git a/01-ActivityModel/activity-model/config/lad_codes.yaml b/01-ActivityModel/activity-model/config/lad_codes.yaml
new file mode 100644
index 0000000..6f7054a
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/lad_codes.yaml
@@ -0,0 +1,8 @@
+################################################################################
+# Insert in this file the local authority district codes (lad_codes) that you
+# are interested in. The default is Haringey (E09000014).
+#
+################################################################################
+
+lad_codes:
+ - E09000014
\ No newline at end of file
diff --git a/01-ActivityModel/activity-model/config/lookups.yaml b/01-ActivityModel/activity-model/config/lookups.yaml
new file mode 100644
index 0000000..5c0e5eb
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/lookups.yaml
@@ -0,0 +1,235 @@
+################################################################################
+# This file lists all the lookups used by the Activity Model to organize the
+# EPC data before it is merged with the Synthetic Population.
+#
+################################################################################
+# EPC data info:
+# - EPCs issued from January 2008 up to and including 30 June 2021.
+# - 21,440,172 Domestic EPCs
+# - Download: 18 October 2021
+#
+# * Note that new EPCs may include keys not listed here!
+#
+################################################################################
+# EPC Headers:
+# Each file has 90 headers (keys). The headers used in this work are:
+# - POSTCODE
+# - LOCAL_AUTHORITY
+# - PROPERTY_TYPE + BUILT_FORM = Accommodation type
+# - CONSTRUCTION_AGE_BAND
+# - TENURE
+# - TOTAL_FLOOR_AREA # m²
+# - NUMBER_HABITABLE_ROOMS
+#
+################################################################################
+# Dictionary lookups:
+# keys: based on the values present in the EPC
+# keys_values: based on the values present in the Synthetic Population
+#
+################################################################################
+# List lookups (accommodation age and floor area)
+# values: arbitrary chunks
+#
+# List layout:
+# -
+# - chunk desired code
+# - chunk minimum value (not included)
+# - chunk maximum value (included)
+################################################################################
+
+## Area lookup from: https://geoportal.statistics.gov.uk/
+area_url: "https://www.arcgis.com/sharing/rest/content/items/8a824519215947da99146692b0a0ff49/data"
+
+area_in_out:
+ - "pcds"
+ - "oa11cd"
+
+accommodation:
+ 'House: Detached': 2
+ 'Bungalow: Detached': 2
+ 'House: Semi-Detached': 3
+ 'Bungalow: Semi-Detached': 3
+ 'House: Mid-Terrace': 4
+ 'House: End-Terrace': 4
+ 'House: Enclosed Mid-Terrace': 4
+ 'House: Enclosed End-Terrace': 4
+ 'Bungalow: Mid-Terrace': 4
+ 'Bungalow: End-Terrace': 4
+ 'Bungalow: Enclosed Mid-Terrace': 4
+ 'Bungalow: Enclosed End-Terrace': 4
+ 'Flat: NO DATA!': 5
+ 'Flat: Detached': 5
+ 'Flat: Semi-Detached': 5
+ 'Flat: Mid-Terrace': 5
+ 'Flat: End-Terrace': 5
+ 'Flat: Enclosed Mid-Terrace': 5
+ 'Flat: Enclosed End-Terrace': 5
+ 'Maisonette: NO DATA!': 5
+ 'Maisonette: Detached': 5
+ 'Maisonette: Semi-Detached': 5
+ 'Maisonette: Mid-Terrace': 5
+ 'Maisonette: End-Terrace': 5
+ 'Maisonette: Enclosed Mid-Terrace': 5
+ 'Maisonette: Enclosed End-Terrace': 5
+ 'Park home: Detached': 5
+ 'Park home: Semi-Detached': 5
+ 'House: NO DATA!': null
+ 'Bungalow: NO DATA!': null
+
+age_categorical:
+ 'England and Wales: before 1900': 1
+ 'England and Wales: 1900-1929': 1
+ 'England and Wales: 1930-1949': 1930
+ 'England and Wales: 1950-1966': 1950
+ 'England and Wales: 1967-1975': 1967
+ 'England and Wales: 1976-1982': 1976
+ 'England and Wales: 1983-1990': 1983
+ 'England and Wales: 1991-1995': 1991
+ 'England and Wales: 1996-2002': 1996
+ 'England and Wales: 2003-2006': 2003
+ 'England and Wales: 2007-2011': 2007
+ 'England and Wales: 2007 onwards': 2007
+ 'England and Wales: 2012 onwards': 2007
+ INVALID!: null
+ Not applicable: null
+ NO DATA!: null
+
+
+gas:
+ N: 0
+ Y: 1
+
+tenure:
+ Owner-occupied: 1
+ owner-occupied: 1
+ rental (social): 5
+ Rented (social): 5
+ rental (private): 6
+ Rented (private): 6
+ Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is no: null
+ unknown: null
+ NO DATA!: null
+
+age_numerical:
+ -
+ - 1
+ - 0
+ - 1929
+ -
+ - 2
+ - 1929
+ - 1949
+ -
+ - 3
+ - 1949
+ - 1966
+ -
+ - 4
+ - 1966
+ - 1975
+ -
+ - 5
+ - 1975
+ - 1982
+ -
+ - 6
+ - 1982
+ - 1990
+ -
+ - 7
+ - 1990
+ - 1995
+ -
+ - 8
+ - 1995
+ - 2002
+ -
+ - 9
+ - 2002
+ - 2006
+ -
+ - 10
+ - 2006
+ - 3000
+
+floor_area:
+ -
+ - 1
+ - 0
+ - 25
+ -
+ - 2
+ - 25
+ - 50
+ -
+ - 3
+ - 50
+ - 75
+ -
+ - 4
+ - 75
+ - 100
+ -
+ - 5
+ - 100
+ - 125
+ -
+ - 6
+ - 125
+ - 150
+ -
+ - 7
+ - 150
+ - 175
+ -
+ - 8
+ - 175
+ - 200
+ -
+ - 9
+ - 200
+ - 225
+ -
+ - 10
+ - 225
+ - 250
+ -
+ - 11
+ - 250
+ - 275
+ -
+ - 12
+ - 275
+ - 300
+ -
+ - 13
+ - 300
+ - 325
+ -
+ - 14
+ - 325
+ - 350
+ -
+ - 15
+ - 350
+ - 375
+ -
+ - 16
+ - 375
+ - 400
+ -
+ - 17
+ - 400
+ - 425
+ -
+ - 18
+ - 425
+ - 450
+ -
+ - 19
+ - 450
+ - 475
+ -
+ - 20
+ - 475
+ - 500
diff --git a/01-ActivityModel/activity-model/config/psm.yaml b/01-ActivityModel/activity-model/config/psm.yaml
new file mode 100644
index 0000000..7520d71
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/psm.yaml
@@ -0,0 +1,17 @@
+################################################################################
+# Insert in this file the necessary information to configure the
+# EnrichingPopulation class.
+#
+################################################################################
+
+n_neighbors: 200
+
+overlap_columns:
+ - "LC4402_C_TYPACCOM"
+ - "tenure"
+ - "Area_factor"
+
+matches_columns:
+ - "FLOOR_AREA"
+ - "GAS"
+ - "ACCOM_AGE"
diff --git a/01-ActivityModel/activity-model/config/spenser.yaml b/01-ActivityModel/activity-model/config/spenser.yaml
new file mode 100644
index 0000000..a191483
--- /dev/null
+++ b/01-ActivityModel/activity-model/config/spenser.yaml
@@ -0,0 +1,6 @@
+################################################################################
+# Insert in this file the necessary information to download SPENSER data.
+#
+################################################################################
+
+spenser_url: "https://osf.io/623qz/download"
\ No newline at end of file
diff --git a/01-ActivityModel/activity-model/environment.yml b/01-ActivityModel/activity-model/environment.yml
new file mode 100644
index 0000000..7175bb4
--- /dev/null
+++ b/01-ActivityModel/activity-model/environment.yml
@@ -0,0 +1,21 @@
+name: energyflex1
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - matplotlib=3.5.1
+ - numpy=1.21.2
+ - pandas=1.4.1
+ - python=3.9.7
+ - pip=21.2.4
+ - pyyaml=6.0
+ - requests=2.27.1
+ - scikit-learn=1.0.2
+ - seaborn=0.11.2
+ - setuptools=58.0.4
+ - pytest=6.2.5
+ - black=19.10b0
+ - sphinx=4.4.0
+ - sphinx_rtd_theme=0.4.3
+ - pip:
+ - causalinference==0.1.3
diff --git a/01-ActivityModel/activity-model/setup.py b/01-ActivityModel/activity-model/setup.py
new file mode 100644
index 0000000..86456ec
--- /dev/null
+++ b/01-ActivityModel/activity-model/setup.py
@@ -0,0 +1,32 @@
+import setuptools
+
+with open("README.md", "r") as fh:
+ long_description = fh.read()
+
+setuptools.setup(
+ name="activity-model",
+ version="0.1.0",
+ author="Patricia Ternes",
+ author_email="p.ternesdallagnollo@leeds.ac.uk",
+ description="The Activity Model package",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ # url="#",
+ # install_requires=[
+ # "numpy=1.21.2",
+ # "pandas=1.4.1",
+ # "requests=2.27.1",
+ # "causalinference==0.1.3",
+ # "scikit-learn=1.0.2",
+ # "seaborn=0.11.2",
+ # "matplotlib=3.5.1",
+ # "pyyaml=6.0",
+ # ]
+ packages=setuptools.find_packages(),
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "Programming Language :: Python :: 3.9",
+ "Intended Audience :: Science/Research",
+ ],
+ python_requires=">=3.9",
+)
diff --git a/01-ActivityModel/activity-model/tests/__init__.py b/01-ActivityModel/activity-model/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/01-ActivityModel/activity-model/tests/test_activity_model.py b/01-ActivityModel/activity-model/tests/test_activity_model.py
new file mode 100644
index 0000000..6d0a755
--- /dev/null
+++ b/01-ActivityModel/activity-model/tests/test_activity_model.py
@@ -0,0 +1,48 @@
+from activity_model import __version__
+from activity_model.data_preparation import Epc
+
+import requests
+import pytest
+
+
+def test_version():  # must stay in sync with the version declared in setup.py
+    assert __version__ == "0.1.0"
+
+
+@pytest.fixture
+def epc():
+ epc = Epc()
+ return epc
+
+
+def test_lookup_type(epc):
+ assert type(epc.accommodation_lookup) is dict
+ assert type(epc.age_categorical_lookup) is dict
+ assert type(epc.gas_lookup) is dict
+ assert type(epc.tenure_lookup) is dict
+ assert type(epc.age_numerical_lookup) is list
+ assert type(epc.floor_area_lookup) is list
+ assert type(epc.area_lookup) is dict
+
+
+def test_epc_connection(epc):  # live network smoke test of the configured EPC credentials
+    url = epc.epc_url
+    user = epc.epc_user
+    key = epc.epc_key
+    headers = {"Accept": "text/csv"}  # ask the API for CSV responses
+
+    r = requests.head(url, headers=headers, auth=(user, key))  # HEAD: status only, no body
+    assert (
+        r.status_code == 200
+    ), "Please check your EPC credentials here: config/epc_api.yaml"
+
+
+# test area lookup connection?
+# test spenser connection?
+
+# test if area column has the right values
+# test if floor area has the right values
+# test if age has the right values
+# test if gas has the right values
+# test if tenure has the right values
+# test if accommodation type has the right values
From d5946698941a86c06ac5444754648b94efadaf12 Mon Sep 17 00:00:00 2001
From: Patricia Ternes
Date: Mon, 11 Apr 2022 14:47:59 +0100
Subject: [PATCH 2/2] fix path
---
01-ActivityModel/activity-model/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/01-ActivityModel/activity-model/README.md b/01-ActivityModel/activity-model/README.md
index 5675d08..f6cea5b 100644
--- a/01-ActivityModel/activity-model/README.md
+++ b/01-ActivityModel/activity-model/README.md
@@ -29,7 +29,7 @@ machine:
```bash
$ git clone https://github.com/anetobradley/energy_flex.git
-$ cd energy_flex/01-ActivityModel
+$ cd energy_flex/01-ActivityModel/activity-model/
```
This package requires a specific