From bd10559bc998f7a163fd3d686c877b899d4657dc Mon Sep 17 00:00:00 2001 From: Seth Spielman Date: Fri, 20 Dec 2019 06:53:28 -0700 Subject: [PATCH 1/2] Reconcile multiple repos. --- code/data_prep.py | 133 ++++------- code/data_processing_functions.py | 258 +++++++++++++++++++++ code/varible_names_and_expected_contrib.py | 35 +++ 3 files changed, 338 insertions(+), 88 deletions(-) create mode 100644 code/data_processing_functions.py create mode 100644 code/varible_names_and_expected_contrib.py diff --git a/code/data_prep.py b/code/data_prep.py index 4701dd4..05e137d 100644 --- a/code/data_prep.py +++ b/code/data_prep.py @@ -1,98 +1,40 @@ -# !/usr/bin/env - -# This script loads and prepares Census and ACS data. -# Outputs a CSV file that can be used for the construction of SoVI -# Interpolates missing values using nearby values -# Calculates SE for compositve variables +""" +# Prepare Census and ACS data. +# Outputs a CSV file for the construction of SoVI +# Calculates SE for composite variables +""" import os import pandas as pd import pysal as ps import numpy as np -@TODO: Finish Doc strings -@TODO: Cleanup file - +from data_processing_functions import se_sum, se_ratio, se_prop, equal_test pd.set_option("chained_assignment", None) path = os.getcwd() -# path = os.path.dirname(os.getcwd()) # if running from the 'code' directory -outPath=os.path.join(path,'data') -ipath = os.path.join(path,'data','input') -spath = os.path.join(path,'data','spatial') -# -# functions fot the calculation of SE - - -def se_sum(*ses): - """ - compute the standard errors for a composite (sum) variable) - - :param ses: an arbitrary number of standard errors for variables participating in a sum (composite variable) - :return: Data frame containing standard errors for the composite variable - """ - - df_temp = pd.DataFrame(list(ses)) - df_temp = df_temp.T - df_temp = np.square(df_temp) - df_temp = df_temp.sum(1) - return np.sqrt(df_temp) - - -# SE of a ratio -def se_ratio(est, estd, sen, sed): - """ - - :param est: - :param estd: - :param sen: - :param sed: - :return: - """ - - sen2 = np.square(sen) - sed2 = np.square(sed) - est2 = np.square(est) - num = np.sqrt(sen2 + (est2 * sed2)) - return num / estd - - -# SE of a proprotion -def se_prop(est, estd, sen, sed): - sen2 = np.square(sen) - sed2 = np.square(sed) - est2 = np.square(est) - num = sen2 - (est2 * sed2) - num_alt = sen2 + (est2 * sed2) - problems = num <= 0 - num[problems] = num_alt[problems] - num = np.sqrt(num) - return num / estd - - -# unit test for equivalency between original and constructed variables -def equal_test(orig, alt): - if np.equal(orig, alt).sum() != db.shape[0]: - if (db.shape[0] - np.equal(orig, alt).sum()) \ - == np.isnan(orig).sum() == np.isnan(alt).sum(): - pass - else: - print("problem in equal test") - raise +outPath = os.path.join(path, 'data') +ipath = os.path.join(path, 'data', 'input') +spath = os.path.join(path, 'data', 'spatial') +""" +LOAD AND PREP DATA +Data used in this analysis was downloaded from social explorer, a fee based service that facilitates +download of census and acs data. 
+ +@TODO: Description of each file loaded below needed +""" # define data types make_strings = {'Geo_FIPS': object, 'Geo_STATE': object, 'Geo_COUNTY': object, 'Geo_TRACT': object, 'Geo_CBSA': object, 'Geo_CSA': object} -# load data -# JOE: I had to change the encoding to latin-1 to avoid hitting a UTF-8 error acs = pd.read_csv(os.path.join(ipath, 'sovi_acs.csv'), - dtype=make_strings, skiprows=1,encoding='latin-1') + dtype=make_strings, skiprows=1, encoding='latin-1') census = pd.read_csv(os.path.join(ipath, 'sovi_decennial.csv'), - dtype=make_strings, skiprows=1,encoding='latin-1') + dtype=make_strings, skiprows=1, encoding='latin-1') acs_samp = pd.read_csv(os.path.join(ipath, 'sovi_acs_sampSize.csv'), - dtype=make_strings, skiprows=1,encoding='latin-1') + dtype=make_strings, skiprows=1, encoding='latin-1') # format FIPS acs.index = 'g' + acs.Geo_FIPS @@ -107,24 +49,24 @@ def equal_test(orig, alt): # if available add supplmentary data try: census_sup1 = pd.read_csv(os.path.join(ipath, 'sovi_decennial_sup1.csv'), - dtype=make_strings,skiprows=1,encoding='latin-1') + dtype=make_strings, skiprows=1, encoding='latin-1') census_sup1.index = 'g' + census_sup1.Geo_FIPS db = db.join(census_sup1, rsuffix='_decSup1') -except: +except FileNotFoundError: print("no supplementary decennial data") try: acs_sup1 = pd.read_csv(os.path.join(spath, 'sovi_acs_sup1.csv'), - dtype=make_strings,skiprows=1,encoding='latin-1') + dtype=make_strings, skiprows=1, encoding='latin-1') acs_sup1.index = 'g' + acs_sup1.Geo_FIPS db = db.join(acs_sup1, rsuffix='_acsSup1') -except: +except FileNotFoundError: print("did not pull supplementary ACS data - A") try: acs_sup2 = pd.read_csv(os.path.join(ipath, 'sovi_acs_kids.csv'), - dtype=make_strings, skiprows=1,encoding='latin-1') + dtype=make_strings, skiprows=1, encoding='latin-1') acs_sup2.index = 'g' + acs_sup2.Geo_FIPS db = db.join(acs_sup2, rsuffix='_acsSup2') -except: +except FileNotFoundError: print("did not pull supplementary ACS data - B") # drop Puerto Rico (sorry PR!) @@ -132,13 +74,15 @@ def equal_test(orig, alt): # define SE columns se_cols = [i for i in db.columns if i[-1] == 's' and i[0] == 'A'] -db[se_cols] *= (1.65 / 1.645) +db[se_cols] *= (1.65 / 1.645) # conversion rate for 90 pct conf interval -# calculate weights matrix -w = ps.queen_from_shapefile(os.path.join(spath, 'USA_Counties_500k.shp'), - idVariable='geoFIPS') -w.transform = 'R' +""" +COMPUTE SOVI INPUT DF +This the code below is fairly elaborate, this is necessary because many of the vairables used in the SOVI index +have to be derived from other variables. We compute variables from both the decennial census (2010) and the ACS +below. +""" # output dataframe db1 = pd.DataFrame(index=db.index) @@ -254,6 +198,16 @@ def equal_test(orig, alt): # I didn't understand QURBRURX db1['POPDENS'] = db.ACS12_5yr_B01003001 / (db.SE_T02A_002 * 1.) +""" +SPATIALLY IMPUTE MISSING VALUES + +For some counties homevalue is missing from the census so we impute it by taking the average of neighboring counties. +""" +# calculate weights matrix +w = ps.queen_from_shapefile(os.path.join(spath, 'USA_Counties_500k.shp'), + idVariable='geoFIPS') +w.transform = 'R' + # if no home value, assign the spatial lag of the estimate and SE homeval = db1['MHSEVAL_ALT'].copy() homeval_se = db.ACS12_5yr_B25077001s.copy() @@ -425,6 +379,9 @@ def equal_test(orig, alt): db1['sample_person'] = db.ACS12_5yr_B00001001 db1['sample_hu'] = db.ACS12_5yr_B00002001 +# writing index as variable because someone wrote bad code and this required. 
+db1['Geo_FIPS'] = db1.index.values + # Save data if main if __name__ == "__main__": db1.to_csv(os.path.join(path, 'sovi_inputs.csv')) diff --git a/code/data_processing_functions.py b/code/data_processing_functions.py new file mode 100644 index 0000000..839b9bc --- /dev/null +++ b/code/data_processing_functions.py @@ -0,0 +1,258 @@ +import pandas as pd +import numpy as np + +pd.set_option("chained_assignment", None) + +def se_sum(*ses): + """ + compute the standard errors for a composite (sum) variable) + + :param ses: an arbitrary number of standard errors for variables participating in a sum (composite variable) + :return: Data frame containing standard errors for the composite variable + """ + + df_temp = pd.DataFrame(list(ses)) + df_temp = df_temp.T + df_temp = np.square(df_temp) + df_temp = df_temp.sum(1) + return np.sqrt(df_temp) + + +# SE of a ratio +def se_ratio(est, estd, sen, sed): + """ + compute the standard error of a ratio + + :param est: a list of estimates (numerator in the ratio) + :param estd: a list of estimates (denominator in the ratio) + :param sen: standard error of the numerators + :param sed: standard error of the denominators + :return: standard error of a ratio + """ + + sen2 = np.square(sen) + sed2 = np.square(sed) + est2 = np.square(est) + num = np.sqrt(sen2 + (est2 * sed2)) + return num / estd + + +# SE of a proprotion +def se_prop(est, estd, sen, sed): + """ + Compute standard error of a proportion + + :param est: a list of estimates (numerator in the ratio) + :param estd: a list of estimates (denominator in the ratio) + :param sen: standard error of the numerators + :param sed: standard error of the denominators + :return: standard error of a proportion + """ + sen2 = np.square(sen) + sed2 = np.square(sed) + est2 = np.square(est) + num = sen2 - (est2 * sed2) + num_alt = sen2 + (est2 * sed2) + problems = num <= 0 + num[problems] = num_alt[problems] + num = np.sqrt(num) + return num / estd + + +# unit test for equivalency between original and constructed variables +def equal_test(orig, alt, db): + """ + When we construct composite variables we would like to ensue that the construction logic is sound. To do this we + use this small test, which takes an estimated constructed by us and the identical estimate, as published, and + compares them. We can only do this test for certain variables. But its helps us know that the logic used to + construct the input data set is sound + + :param orig: an estimate published by the bureau + :param alt: an estiamte constructed by us. 
+ :param + :return: + """ + try: + assert (np.equal(orig, alt).sum() != db.shape[0]) and ((db.shape[0] - np.equal(orig, alt).sum()) == np.isnan(orig).sum() == np.isnan(alt).sum()), "values not equal" + except AssertionError: + print ("Equal Test Failure") + +def flip_signs(sovi_data, input_names): + """ + Flipped the signs of z-scored inputs such that they align with theory + + :param sovi_data: input csv files containing columns with names matching input names + :param input_names: a list of lists w/ variable names, expected contribution to the index, and additional info + :return: data frame with the same dims as the input but w/ signs flipped + """ + for name, sign, sample, hrname in input_names: + if name.isin(sovi_data.columns()): + if sign == 'neg': + sovi_data[name] = -sovi_data[name].values + elif sign == 'pos': + pass + return sovi_data + + +import pandas as pd +import pysal as ps +import numpy as np +from scipy.stats.mstats import zscore as ZSCORE +from scipy.stats import rankdata, spearmanr +from spss_pca import SPSS_PCA + + +# TODO: move into Data processing + +def dropAny(inputs, scores, drop=1, subset=None, netContrib=None, return_drop_rank=False): + """ + + :param inputs: the input variables USA_All + :param scores: scores, the SoVI outputs containing scores and ranks + :param drop: Number of places to drop, default 1 + :param subset: list of GEOIDs for subset (use for FEMA region or state) + :param netContrib: + :param return_drop_rank: + :return: + """ + # TODO: check input and scores data structure + # TODO: Break out subset logic, shoud not exist within this fuction. + if not subset is None: + scores = scores[ + scores[scores.columns[len(scores.columns) - 1]].astype('str').str.contains(subset)] # scores for region id + inputs = inputs[inputs.index.isin(scores[scores.columns[len(scores.columns) - 1]].index)] # inputs for subset + + # the data without no 1 + # TODO: Unclear which input is dropped, enforce sorting withing function + drop_no1 = inputs.drop(drop) + + # preserve GEOIDs as an index + # for computed SoVI + geoLevels = drop_no1.Geo_FIPS + + # Compute drop "number one" + pca = SPSS_PCA(drop_no1.drop(['Geo_FIPS', 'stateID'], axis=1, inplace=False), reduce=True, varimax=True) + sovi_actual = pca.scores_rot.sum(1) + sovi_actual = pd.DataFrame(sovi_actual, index=geoLevels, columns=['sovi']) + drop1p_score = sovi_actual.values # SoVI score + + if netContrib is None: # no net contrib var's specified, compute county ranks + + # preserve the original county ranks + # also for computing change in rank... + orig_rank = scores.drop(drop)['rank'] + + # add SoVI ranks for run + drop1p_rank = pd.Series([i[0] for i in sovi_actual.values], index=geoLevels).rank(ascending=False) + + obs_rchg_drop1 = pd.DataFrame({'orig_rank': orig_rank, 'drop1p_rank': drop1p_rank}, index=orig_rank.index) + obs_rchg_drop1 = obs_rchg_drop1.apply(lambda x: x.astype('int'), axis=1) # ensure all ints + obs_rchg_drop1['rank_chg'] = obs_rchg_drop1.orig_rank - obs_rchg_drop1.drop1p_rank + + return obs_rchg_drop1 + + else: # net contribution + + # variable rank using absolute value + rankContrib = abs(netContrib).apply(rankdata, axis=0, method='average') + rankContrib = (28 - rankContrib) + 1 + + # Construct table to hold the results of the drop one analysis + # Sort variable list based on importance rank. 
+ if not subset: + varRanks = rankContrib['USA'].copy() # have to make a copy to sort index + varRanks.sort('USA') + else: + varRanks = rankContrib[subset].copy() # have to make a copy to sort index + varRanks.sort(subset) + + # recompute net contribution for drop no1 + Drop1_NetContrib = pd.Series(data=pca.weights_rot.sum(1), index=drop_no1.columns.drop(['Geo_FIPS', 'stateID'])) + Drop1_NetContrib = Drop1_NetContrib.transpose() + Drop1_NetContrib = Drop1_NetContrib.convert_objects(convert_numeric=True) + Drop1_NetContrib = Drop1_NetContrib.apply(lambda x: np.round(x, 2)) + Drop1_NetContrib = Drop1_NetContrib.rank(ascending=False) + + Drop1_NetContrib = Drop1_NetContrib[varRanks.index] # sort values by original index ranking + + nc_chg_drop1p = pd.DataFrame({'orig_rank': varRanks, 'drop1p_rank': Drop1_NetContrib}) + nc_chg_drop1p = nc_chg_drop1p.apply(lambda x: x.astype('int'), axis=1) # ensure all ints + nc_chg_drop1p['rank_chg'] = nc_chg_drop1p.orig_rank - nc_chg_drop1p.drop1p_rank + + return nc_chg_drop1p + + +def rankChgTable(inputs, scores, obs_names, subset=None, top=5, cor=False, drop=1, verbose=True): + dropany_result = dropAny(inputs=inputs, scores=scores, subset=subset, drop=drop) + + # ensure GEOID column in place + # assumes missing GEOID values stored in df index + if not 'geoFIPS' in dropany_result: + dropany_result['geoFIPS'] = dropany_result.index + + # merge dropany results with obs names + rctab = dropany_result.merge(obs_names, on='geoFIPS') + + # print the spearman rank correlation if specified + if cor: + spearcor = spearmanr(rctab.drop1p_rank, rctab.orig_rank) + if verbose: + print("Spearman Rank Correlation: " + str(np.round(spearcor[0], 5)), + "\np-value: " + str(np.round(spearcor[1], 4))) + print('\n') + + # dropped obs rank + drop_co = obs_names[obs_names.geoFIPS.str.contains(drop)] + drop_co['orig_rank'] = scores.ix[drop]['rank'] + + # assemble table for original ranks + orrk = rctab[rctab.orig_rank <= top].ix[:, ['geoFIPS', 'orig_rank', 'NAME']] + + if int(drop_co.orig_rank) <= top: # append dropped obs to table if top-ranked + orrk = drop_co.append(orrk) + + orrk = orrk.sort_values('orig_rank') + orrk['Top_Orig'] = orrk.NAME + " (" + orrk.orig_rank.astype('int').astype('str') + ")" + orrk.index = [i + 1 for i in range(0, top)] + orrk.ix[:, ['NAME', 'Top_Orig']] + + # assemble table for dropany ranks + d1rk = rctab[rctab.drop1p_rank <= top].ix[:, ['geoFIPS', 'drop1p_rank', 'orig_rank', 'NAME']].sort_values( + 'drop1p_rank') + d1rk['Top_dropany'] = d1rk.NAME + " (" + d1rk.orig_rank.astype('int').astype('str') + ")" + d1rk.index = [i + 1 for i in range(0, top)] + d1rk.ix[:, ['NAME', 'Top_dropany']] + + # return the tables combined + return pd.DataFrame({'All_Counties': orrk.Top_Orig, 'Drop_1': d1rk.Top_dropany}) + + +# wrap to a function +def dropCors(inputs, scores, subset=None): + cors = [] + + if subset is None: + geo_idx = scores.index.values + else: + geo_idx = scores[ + scores[scores.columns[len(scores.columns) - 1]].astype('str').str.contains(subset)].index.values + + for i in geo_idx: + drop_i = dropAny(inputs=inputs, scores=scores, subset=subset, drop=i) + cor = spearmanr(drop_i.drop1p_rank, drop_i.orig_rank) + cors.append(cor[0]) + + return pd.Series(cors, index=geo_idx) + + +## function for plotting rank quantile moves + +def rankQuantileMoves(inputs, scores, drop, subset=None, verbose=True): + da = dropAny(inputs=inputs, scores=scores, subset=subset, drop=drop) + if verbose: + print(ps.Quantiles(da.orig_rank)) # quantile breaks key + print('\n') + r0 = 
ps.Quantiles(da.orig_rank).yb + r1 = ps.Quantiles(da.drop1p_rank).yb + moves_raw = pd.DataFrame({'r0': r0, 'r1': r1}).groupby(['r0', 'r1']).size().unstack(fill_value=0) + return np.round(moves_raw.apply(lambda x: x / sum(x), axis=1), 2) diff --git a/code/varible_names_and_expected_contrib.py b/code/varible_names_and_expected_contrib.py new file mode 100644 index 0000000..063ad6e --- /dev/null +++ b/code/varible_names_and_expected_contrib.py @@ -0,0 +1,35 @@ +# attribute name and expected influence on vulnerability +class sovi_vars: + + def __init__(self): + input_names = [['MEDAGE_ACS', 'pos', 'person', 'Median Age'], + ['BLACK_ACS', 'pos', 'person', 'Pop African-American (%)'], + ['QNATAM_ACS', 'pos', 'person', 'Pop Native American (%)'], + ['QASIAN_ACS', 'pos', 'person', 'Pop Asian (%)'], + ['QHISP_ACS', 'pos', 'person', 'Pop Hispanic (%)'], + ['QAGEDEP_ACS', 'pos', 'person', 'Age Dependency (%)'], + ['QPUNIT_ACS', 'pos', 'person', 'Persons Per Housing Unit'], + ['PRENTER_ACS', 'pos', 'hu', 'Rental Housing (%)'], + ['QNRRES_ACS', 'pos', 'person', 'Nursing Home Residents (%)'], + ['QFEMALE_ACS', 'pos', 'person', 'Pop Female (%)'], + ['QFHH_ACS', 'pos', 'hu', 'Female-Headed Households (%)'], + ['QUNOCCHU_ACS', 'pos', 'hu', 'Vacant Housing (%)'], + ['PERCAP_ALT', 'neg', 'person', 'Per-Capita Income'], + ['QESL_ALT', 'pos', 'person', 'English as Second Language (%)'], + ['QCVLUN', 'pos', 'person', 'Unemployment (%)'], + ['QPOVTY', 'pos', 'person', 'Poverty (%)'], + ['QMOHO', 'pos', 'hu', 'Mobile Homes (%)'], + ['QED12LES_ALT', 'pos', 'person', + 'Adults Completed $200K (%)'], + ['MDGRENT_ALT', 'neg', 'hu', 'Median Rent'], + ['MHSEVAL_ALT', 'neg', 'hu', 'Median Home Value'], + ['POPDENS', 'pos', 'person', 'Population Density']] + From 69ec477b69643b6374348189002a048c5effe52a Mon Sep 17 00:00:00 2001 From: Seth Spielman Date: Fri, 20 Dec 2019 07:21:19 -0700 Subject: [PATCH 2/2] found a notebook!? --- code/create_FEMA_region_sovi_scores.ipynb | 1949 +++++++++++++++++++++ 1 file changed, 1949 insertions(+) create mode 100644 code/create_FEMA_region_sovi_scores.ipynb diff --git a/code/create_FEMA_region_sovi_scores.ipynb b/code/create_FEMA_region_sovi_scores.ipynb new file mode 100644 index 0000000..615193b --- /dev/null +++ b/code/create_FEMA_region_sovi_scores.ipynb @@ -0,0 +1,1949 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#The Validity of the SoVi Index\n", + "###Seth E. Spielman, David Folch, Joseph Tuccillo\n", + "\n", + "This is a script to calculate the Social Vulnerability (Sovi) Index at multiple geographic scales and usign different sets of input variables. The Sovi index assess a places social vulnerability to natural hazards and has been used in hundreds of publications, in both the academic and policy domains. We show that this index fails to meet certain intuitive assumptions about the nature of measurement of social vulnerability.\n", + "\n", + "The aim is to illustrate instability in the index and problems with *convergent validity*. Convergent Validity is the idea that multiple measurements of the same thing, using a valid instruments, should yield similar *absolute* and *relative* measurements. For example two thermometers measuing the same cup of water should yield the same approximate temperature- this would be an example of validitiy of absolute measurement. 
A less rigorous concept of validity is to consider three cups ordered from hottest (_**A**_) to coldest (_**C**_); the two thermometers would be valid in a *relative* sense if their measurements of cups A, B, and C differed absolutely (they measured different temperatures for each cup) but still placed cup _**A**_ as the warmest and cup _**C**_ as the coldest.\n",
+    "\n",
+    "We will show in this analysis that the Sovi Index fails this \"cup test\"; that is, it often mixes up the order of the cups. Counties in the United States that are highly vulnerable at one scale of measurement (or form of the index) are often low-vulnerability in a slightly different version of the index.\n",
+    "\n",
+    "##Variables and Components\n",
+    "The Sovi Index is constructed using a technique called Principal Components Analysis (PCA), a matrix decomposition method that uses the covariance matrix of the input data. Usually, in the social sciences one treats the \"components\" that come out of a PCA as latent variables. For example, in Sovi it is common to find components that measure things like \"race and class\". In this analysis we also show that this latent variable approach has masked some underlying problems with the Sovi index, namely that variables contribute to the index in ways that are profoundly counterintuitive.\n",
+    "\n",
+    "##There is a paper\n",
+    "For an in-depth discussion of these ideas please see the companion paper to this analysis [URL]() or contact the authors."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##Data Prep\n",
+    "\n",
+    "In this section we read in data from the American Community Survey and the Decennial Census and process the variables so that they correspond to the inputs commonly used to construct the Sovi Index. There is **a lot of data wrangling here**: combining variables into new variables, computing standard errors, etc. It is all fairly straightforward and I will not talk through the details here."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ImportError", + "evalue": "No module named mdp", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrankdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mspearmanr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mspss_pca\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSPSS_PCA\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mlocal_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'/Users/sspielman'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sspielman/Dropbox/SoVI_var_wise_paper/code/spss_pca.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmstats\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mzscore\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mZSCORE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mmdp\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mMDP\u001b[0m \u001b[0;31m# causing sklearn deprication warning\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moperator\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mitemgetter\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mImportError\u001b[0m: No module named mdp" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "import pysal as ps\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from scipy.stats.mstats import zscore as ZSCORE\n", + "from scipy.stats import rankdata\n", + "from scipy.stats import spearmanr \n", + "from spss_pca import SPSS_PCA\n", + "\n", + "local_path = '/Users/sspielman'\n", + "#local_path = '/Users/dfolch/'\n", + "\n", + "os.chdir(local_path+'Dropbox/SoVI_var_wise_paper/code')\n", + "path=local_path+'/Dropbox/SoVI_var_wise_paper'\n", + "outPath =local_path+'/Dropbox/SoVI_var_wise_paper/data'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Functions for calculating Standard Errors\n", + "\n", + "THe functions below are to calculate the standard error (SE) of various types of estimates from the American Cummunity Survey. These are provided by the US Census Bureau in the ACS Techncial Documentation." 
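+    "\n",
+    "For reference, a sketch of the formulas these functions implement (our notation, kept consistent with the code below): the SE of a sum is $\\sqrt{\\sum_i SE_i^2}$; the SE of a ratio $\\hat{R} = \\hat{X}_{num} / \\hat{X}_{den}$ is $\\frac{1}{\\hat{X}_{den}} \\sqrt{SE_{num}^2 + \\hat{R}^2 SE_{den}^2}$; the SE of a proportion uses $-$ instead of $+$ under the radical, falling back to the ratio form whenever that difference is not positive."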
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#SE of a sum\n", + "def se_sum(*ses):\n", + " df_temp = pd.DataFrame(list(ses))\n", + " df_temp = df_temp.T\n", + " df_temp = np.square(df_temp)\n", + " df_temp = df_temp.sum(1)\n", + " return np.sqrt(df_temp)\n", + "\n", + "#SE of a ratio\n", + "def se_ratio(est, estd, sen, sed):\n", + " sen2 = np.square(sen)\n", + " sed2 = np.square(sed)\n", + " est2 = np.square(est)\n", + " num = np.sqrt(sen2 + (est2*sed2))\n", + " return num / estd\n", + "\n", + "#SE of a proprotion\n", + "def se_prop(est, estd, sen, sed):\n", + " sen2 = np.square(sen)\n", + " sed2 = np.square(sed)\n", + " est2 = np.square(est)\n", + " num = sen2 - (est2*sed2)\n", + " num_alt = sen2 + (est2*sed2)\n", + " problems = num <= 0\n", + " num[problems] = num_alt[problems]\n", + " num = np.sqrt(num)\n", + " return num / estd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#%%capture capt\n", + "pd.set_option(\"chained_assignment\", None)\n", + "\n", + "path=local_path+\"Dropbox/SoVI_var_wise_paper/data/input\"\n", + "spath=local_path+\"Dropbox/SoVI_var_wise_paper/data/spatial\"\n", + "\n", + "make_strings = {'Geo_FIPS':object, 'Geo_STATE':object, 'Geo_COUNTY':object, \n", + " 'Geo_TRACT':object, 'Geo_CBSA':object, 'Geo_CSA':object}\n", + "acs = pd.read_csv(os.path.join(path,'sovi_acs.csv'), dtype=make_strings, skiprows=1)\n", + "acs.index = 'g' + acs.Geo_FIPS\n", + "census = pd.read_csv(os.path.join(path,'sovi_decennial.csv'), dtype=make_strings, skiprows=1)\n", + "census.index = 'g' + census.Geo_FIPS\n", + "db = census\n", + "db = db.join(acs, rsuffix='_acs')\n", + "\n", + "acs_samp = pd.read_csv(os.path.join(path,'sovi_acs_sampSize.csv'), dtype=make_strings, skiprows=1)\n", + "acs_samp.index = 'g' + acs_samp.Geo_FIPS\n", + "db = db.join(acs_samp, rsuffix='_acsSamp')\n", + "\n", + "try:\n", + " census_sup1 = pd.read_csv(os.path.join(path,'sovi_decennial_sup1.csv'), dtype=make_strings, skiprows=1)\n", + " census_sup1.index = 'g' + census_sup1.Geo_FIPS\n", + " db = db.join(census_sup1, rsuffix='_decSup1')\n", + "except:\n", + " print 'no supplementary decennial data'\n", + "try:\n", + " acs_sup1 = pd.read_csv(os.path.join(spath,'sovi_acs_sup1.csv'), dtype=make_strings, skiprows=1)\n", + " acs_sup1.index = 'g' + acs_sup1.Geo_FIPS\n", + " db = db.join(acs_sup1, rsuffix='_acsSup1')\n", + "except:\n", + " print 'did not pull supplementary ACS data - A'\n", + "try:\n", + " acs_sup2 = pd.read_csv(os.path.join(path,'sovi_acs_kids.csv'), dtype=make_strings, skiprows=1)\n", + " acs_sup2.index = 'g' + acs_sup2.Geo_FIPS\n", + " db = db.join(acs_sup2, rsuffix='_acsSup2')\n", + "except:\n", + " print 'did not pull supplementary ACS data - B'\n", + " \n", + "# drop Puerto Rico (sorry PR!)\n", + "db = db[db.Geo_STATE != '72']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Preparing Sovi Inputs and Calculating SE\n", + "\n", + "A few counties are missing data so we create imput the values for these counties by taking the average value of their neighbors. This is done using the spatial weights matrix `w`." 
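+    "\n",
+    "A minimal sketch of that imputation step, using the same legacy PySAL calls that appear in the next cell (it assumes the imports and `spath` defined above, and a Series `homeval` of median home values whose order matches the ids in `w`):\n",
+    "\n",
+    "```python\n",
+    "w = ps.queen_from_shapefile(os.path.join(spath, 'USA_Counties_500k.shp'), idVariable='geoFIPS')\n",
+    "w.transform = 'R'                   # row-standardize so the lag is a neighbor average\n",
+    "lag = ps.lag_spatial(w, homeval)    # neighbor-averaged home values\n",
+    "homeval[np.isnan(homeval)] = lag[np.isnan(homeval)]  # fill only the missing counties\n",
+    "```"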
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#%%capture capt\n", + "\n", + "# cleanup Social Explorer standard errors\n", + "se_cols = [i for i in db.columns if i[-1]=='s' and i[0]=='A']\n", + "db[se_cols] *= (1.65/1.645)\n", + "\n", + "# weights matrix for computing spatial lags of missing data\n", + "w = ps.queen_from_shapefile(os.path.join(spath,'USA_Counties_500k.shp'), idVariable='geoFIPS')\n", + "w.transform = 'R'\n", + "\n", + "\n", + "# output dataframe\n", + "db1 = pd.DataFrame(index=db.index)\n", + "\n", + "# Decennial variables (original)\n", + "db1['MEDAGE'] = db.SF1_P0130001\n", + "db1['BLACK'] = (db.SF1_P0030003 * 1.) / db.SF1_P0010001\n", + "db1['QNATAM'] = (db.SF1_P0030004 * 1.) / db.SF1_P0010001\n", + "db1['QASIAN'] = (db.SF1_P0030005 * 1.) / db.SF1_P0010001\n", + "db1['QHISP'] = (db.SF1_P0040003 * 1.) / db.SF1_P0010001\n", + "db1['QAGEDEP'] =((db.SF1_P0120003 + db.SF1_P0120027 + db.SF1_P0120020 + \n", + " db.SF1_P0120021 + db.SF1_P0120022 + db.SF1_P0120023 + \n", + " db.SF1_P0120024 + db.SF1_P0120025 + db.SF1_P0120044 + \n", + " db.SF1_P0120045 + db.SF1_P0120046 + db.SF1_P0120047 + \n", + " db.SF1_P0120048 + db.SF1_P0120049) * 1.) / db.SF1_P0010001\n", + "db1['PPUNIT'] = db.SF1_H0100001 / (db.SF1_H0030002 * 1.)\n", + "db1['PRENTER'] = (db.SF1_H0040004 * 1.) / db.SF1_H0010001\n", + "db1['QNRRES'] = (db.SF1_P0420005 * 1.) / db.SF1_P0010001\n", + "db1['QFEMALE'] = (db.SF1_P0120026 * 1.) / db.SF1_P0010001\n", + "db1['QFHH'] = (db.SF1_P0190014 * 1.) / db.SF1_P0180001\n", + "db1['QUNOCCHU']=((db.SF1_H0010001 - db.SF1_H0030002) * 1.) / db.SF1_H0010001\n", + "\n", + "# Decennial variables (alternatives)\n", + "db1['BLACK_ALT'] = (db.SF1_P0050004 * 1.) / db.SF1_P0010001 # exclude hispanic\n", + "db1['QNATAM_ALT'] = (db.SF1_P0050005 * 1.) / db.SF1_P0010001 # exclude hispanic\n", + "db1['QASIAN_ALT'] = (db.SF1_P0050006 * 1.) / db.SF1_P0010001 # exclude hispanic\n", + "db1['QNRRES_ALT'] = (db.SF1_P0430023 + db.SF1_P0430054 * 1.) / db.SF1_P0010001 # 65 and over living in group quarters\n", + "db1['QUNOCCHU_ALT']= (db.SF1_H0030003 * 1.) / db.SF1_H0030001 # same value, simplified computation\n", + "\n", + "# Decennial variables (using ACS data and alternative formulations)\n", + "db1['MEDAGE_ACS'] = db.ACS12_5yr_B01002001\n", + "db1['BLACK_ACS'] = db.ACS12_5yr_B03002004 / (db.ACS12_5yr_B03002001 * 1.)\n", + "db1['QNATAM_ACS'] = db.ACS12_5yr_B03002005 / (db.ACS12_5yr_B03002001 * 1.)\n", + "db1['QASIAN_ACS'] = db.ACS12_5yr_B03002006 / (db.ACS12_5yr_B03002001 * 1.)\n", + "db1['QHISP_ACS'] = db.ACS12_5yr_B03002012 / (db.ACS12_5yr_B03002001 * 1.)\n", + "db1['QAGEDEP_ACS'] =(db.ACS12_5yr_B06001002 + db.ACS12_5yr_B09020001) / (db.ACS12_5yr_B01003001 * 1.)\n", + "db1['QPUNIT_ACS'] = db.ACS12_5yr_B25008001 / (db.ACS12_5yr_B25002002 * 1.)\n", + "db1['PRENTER_ACS'] = db.ACS12_5yr_B25003003 / (db.ACS12_5yr_B25002001 * 1.)\n", + "db1['QNRRES_ACS'] = db.ACS12_5yr_B09020021 / (db.ACS12_5yr_B01003001 * 1.)\n", + "db1['QFEMALE_ACS'] = db.ACS12_5yr_B01001026 / (db.ACS12_5yr_B01003001 * 1.) 
\n", + "db1['QFHH_ACS'] = db.ACS12_5yr_B11001006 / (db.ACS12_5yr_B11001001 * 1.)\n", + "db1['QUNOCCHU_ACS']= db.ACS12_5yr_B25002003 / (db.ACS12_5yr_B25002001 * 1.)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "#############################\n", + "\n", + "# ACS variables (original)\n", + "db1['PERCAP'] = db.ACS12_5yr_B19025001 / (db.ACS12_5yr_B01003001 * 1.)\n", + "db1['QESL'] =((db.ACS12_5yr_B16004029 + db.ACS12_5yr_B16004030 +\n", + " db.ACS12_5yr_B16004034 + db.ACS12_5yr_B16004035 +\n", + " db.ACS12_5yr_B16004039 + db.ACS12_5yr_B16004040 +\n", + " db.ACS12_5yr_B16004044 + db.ACS12_5yr_B16004045 +\n", + " db.ACS12_5yr_B16004051 + db.ACS12_5yr_B16004052 +\n", + " db.ACS12_5yr_B16004056 + db.ACS12_5yr_B16004057 +\n", + " db.ACS12_5yr_B16004061 + db.ACS12_5yr_B16004062 +\n", + " db.ACS12_5yr_B16004066 + db.ACS12_5yr_B16004067) * 1.) /\\\n", + " ((db.ACS12_5yr_B16004024 + db.ACS12_5yr_B16004046) - \n", + " (db.ACS12_5yr_B16004025 + db.ACS12_5yr_B16004047))\n", + "db1.QESL = db1.QESL.replace([np.inf,-np.inf, np.nan], 0)\n", + "db1.QESL = db1.QESL.replace([np.inf,-np.inf], 0)\n", + "db1['QCVLUN'] =((db.ACS12_5yr_B23022025 + db.ACS12_5yr_B23022049) * 1.) /\\\n", + " db.ACS12_5yr_B23022001\n", + "db1['QPOVTY'] = (db.ACS12_5yr_B17021002 * 1.) / db.ACS12_5yr_B17021001\n", + "db1['QMOHO'] = (db.ACS12_5yr_B25024010 * 1.) / db.ACS12_5yr_B25024001\n", + "db1['QED12LES'] =((db.ACS12_5yr_B15002003 + db.ACS12_5yr_B15002004 +\n", + " db.ACS12_5yr_B15002005 + db.ACS12_5yr_B15002006 +\n", + " db.ACS12_5yr_B15002007 + db.ACS12_5yr_B15002008 +\n", + " db.ACS12_5yr_B15002009 + db.ACS12_5yr_B15002010 +\n", + " db.ACS12_5yr_B15002020 + db.ACS12_5yr_B15002021 +\n", + " db.ACS12_5yr_B15002022 + db.ACS12_5yr_B15002023 +\n", + " db.ACS12_5yr_B15002024 + db.ACS12_5yr_B15002025 +\n", + " db.ACS12_5yr_B15002026 + db.ACS12_5yr_B15002027) * 1.) /\\\n", + " db.ACS12_5yr_B15002001\n", + "db1['QFEMLBR'] = (db.ACS12_5yr_C24010038 * 1.) / db.ACS12_5yr_C24010001\n", + "db1['QEXTRCT'] =((db.ACS12_5yr_C24030003 + db.ACS12_5yr_C24030030) * 1.) /\\\n", + " db.ACS12_5yr_C24030001\n", + "db1['QSERV'] =((db.ACS12_5yr_C24010019 + db.ACS12_5yr_C24010055) * 1.) /\\\n", + " db.ACS12_5yr_C24010001\n", + "db1['QSSBEN'] = (db.ACS12_5yr_B19055002 * 1.) / db.ACS12_5yr_B19055001\n", + "db1['QNOAUTO'] =((db.ACS12_5yr_B25044003 + db.ACS12_5yr_B25044010) * 1.) /\\\n", + " db.ACS12_5yr_B25044001\n", + "db1['QFAM'] = (db.ACS12_5yr_B09002002 * 1.) / db.ACS12_5yr_B09002001\n", + "db1.QFAM = db1.QFAM.replace([np.inf,-np.inf, np.nan], 0)\n", + "db1['QRICH200K']= (db.ACS12_5yr_B19001017 * 1.) / db.ACS12_5yr_B11001001\n", + "\n", + "# ACS variables (alternatives)\n", + "db1['PERCAP_ALT'] = db.ACS12_5yr_B19025001 / (db.ACS12_5yr_B25008001 * 1.) # HH income divided by persons in HHs\n", + "db1['QESL_ALT'] =((db.ACS12_5yr_B06007005 + db.ACS12_5yr_B06007008) * 1.) /\\\n", + " db.ACS12_5yr_B06007001 # 5 and older who don't speak English very well\n", + "db1['QED12LES_ALT'] = (db.ACS12_5yr_B16010002 * 1.) / db.ACS12_5yr_B16010001 # same value, simplified computation\n", + "db1['QEXTRCT_ALT'] = (db.ACS12_5yr_C24050002 * 1.) / db.ACS12_5yr_C24050001 # same value, simplified computation\n", + "db1['QSERV_ALT'] = (db.ACS12_5yr_C24050029 * 1.) / db.ACS12_5yr_C24050001 # same value, simplified computation\n", + "db1['QNOAUTO_ALT'] = (db.ACS12_5yr_B08201002 * 1.) 
/ db.ACS12_5yr_B08201001 # same value, simplified computation\n", + "db1['MDGRENT_ALT'] = db.ACS12_5yr_B25064001 # the original computed the median by hand so is not included\n", + "db1['MHSEVAL_ALT'] = db.ACS12_5yr_B25077001 # the original computed the median by hand so is not included\n", + "db1['POPDENS'] = db.ACS12_5yr_B01003001 / (db.SE_T02A_002 * 1.) # I didn't understand QURBRURX\n", + "\n", + "# if no home value, assign the spatial lag of the estimate and SE\n", + "homeval = db1['MHSEVAL_ALT'].copy()\n", + "homeval_se = db.ACS12_5yr_B25077001s.copy()\n", + "dbf = ps.open(os.path.join(spath,'USA_Counties_500k.dbf'))\n", + "\n", + "#Rename dbf GEOIDs to match homeval\n", + "geoid=dbf.by_col('geoFIPS')\n", + "\n", + "shp_fips = pd.DataFrame(dbf.by_col('geoFIPS'), index=geoid)\n", + "shp_fips = shp_fips.join(homeval)\n", + "shp_fips = shp_fips.join(homeval_se)\n", + "shp_fips['MHSEVAL_ALT_LAG'] = ps.lag_spatial(w, shp_fips.MHSEVAL_ALT)\n", + "shp_fips['MHSEVAL_ALT_LAG_SE'] = ps.lag_spatial(w, shp_fips.ACS12_5yr_B25077001s)\n", + "\n", + "##Assign MHSEVAL_ALT and MHSEVAL_ALT_SE values back to neighborless counties \n", + "#This might just act as a temporary fix... \n", + "\n", + "#Get original MHSEVAL_ALT values for problem counties as list\n", + "mh=shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG==0].MHSEVAL_ALT.tolist()\n", + "\n", + "#Reassign values to MHSEVAL_ALT_LAG\n", + "shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG==0,'MHSEVAL_ALT_LAG']=mh\n", + "\n", + "#Reassign missing standard error values\n", + "mhs=shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG_SE==0].ACS12_5yr_B25077001s.tolist()\n", + "shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG_SE==0,'MHSEVAL_ALT_LAG_SE']=mhs\n", + "\n", + "#Get rid of nan values - reassign MHSEVAL_ALT(_SE)\n", + "shp_fips.MHSEVAL_ALT_LAG[np.isnan(shp_fips.MHSEVAL_ALT_LAG)] = shp_fips.MHSEVAL_ALT[np.isnan(shp_fips.MHSEVAL_ALT_LAG)] # replace NA with lag \n", + "shp_fips.MHSEVAL_ALT_LAG_SE[np.isnan(shp_fips.MHSEVAL_ALT_LAG_SE)] = shp_fips.ACS12_5yr_B25077001s[np.isnan(shp_fips.MHSEVAL_ALT_LAG_SE)] # replace NA with lag \n", + "\n", + "\n", + "db1['MHSEVAL_ALT_LAG'] = shp_fips['MHSEVAL_ALT_LAG']\n", + "db1['MHSEVAL_ALT_LAG_SE'] = shp_fips['MHSEVAL_ALT_LAG_SE']\n", + "db1.MHSEVAL_ALT[np.isnan(db1.MHSEVAL_ALT)] = db1.MHSEVAL_ALT_LAG[np.isnan(db1.MHSEVAL_ALT)] \n", + "# note: the lagged SE is pushed to the final column in the SE section below\n", + "\n", + "#############################\n", + "\n", + "# Decennial standard errors (using ACS data and alternative formulations)\n", + "db1['MEDAGE_ACS_SE'] = db.ACS12_5yr_B01002001s\n", + "\n", + "db1['BLACK_ACS_SE'] = se_prop(db1.BLACK_ACS, db.ACS12_5yr_B03002001,\n", + " db.ACS12_5yr_B03002004s, db.ACS12_5yr_B03002001s)\n", + "db1['QNATAM_ACS_SE'] = se_prop(db1.QNATAM_ACS, db.ACS12_5yr_B03002001,\n", + " db.ACS12_5yr_B03002005s, db.ACS12_5yr_B03002001s)\n", + "db1['QASIAN_ACS_SE'] = se_prop(db1.QASIAN_ACS, db.ACS12_5yr_B03002001,\n", + " db.ACS12_5yr_B03002006s, db.ACS12_5yr_B03002001s)\n", + "db1['QHISP_ACS_SE'] = se_prop(db1.QHISP_ACS, db.ACS12_5yr_B03002001,\n", + " db.ACS12_5yr_B03002012s, db.ACS12_5yr_B03002001s)\n", + "\n", + "QAGEDEP_ACS_sen = se_sum(db.ACS12_5yr_B06001002s, db.ACS12_5yr_B09020001s)\n", + "db1['QAGEDEP_ACS_SE'] = se_prop(db1.QAGEDEP_ACS, db.ACS12_5yr_B01003001,\n", + " QAGEDEP_ACS_sen, db.ACS12_5yr_B01003001s)\n", + "\n", + "db1['QPUNIT_ACS_SE'] =se_ratio(db1.QPUNIT_ACS, db.ACS12_5yr_B25002002,\n", + " db.ACS12_5yr_B25008001s, db.ACS12_5yr_B25002002s)\n", + "db1['PRENTER_ACS_SE'] = se_prop(db1.PRENTER_ACS, 
db.ACS12_5yr_B25002001,\n", + " db.ACS12_5yr_B25003003s, db.ACS12_5yr_B25002001s)\n", + "db1['QNRRES_ACS_SE'] = se_prop(db1.QNRRES_ACS, db.ACS12_5yr_B01003001,\n", + " db.ACS12_5yr_B09020021s, db.ACS12_5yr_B01003001s)\n", + "db1['QFEMALE_ACS_SE'] = se_prop(db1.QFEMALE_ACS, db.ACS12_5yr_B01003001,\n", + " db.ACS12_5yr_B01001026s, db.ACS12_5yr_B01003001s) \n", + "db1['QFHH_ACS_SE'] = se_prop(db1.QFHH_ACS, db.ACS12_5yr_B11001001,\n", + " db.ACS12_5yr_B11001006s, db.ACS12_5yr_B11001001s)\n", + "db1['QUNOCCHU_ACS_SE']= se_prop(db1.QUNOCCHU_ACS, db.ACS12_5yr_B25002001,\n", + " db.ACS12_5yr_B25002003s, db.ACS12_5yr_B25002001s)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "#############################\n", + "\n", + "# ACS standard errors (original)\n", + "db1['PERCAP_SE'] = se_ratio(db1.PERCAP, db.ACS12_5yr_B01003001,\n", + " db.ACS12_5yr_B19025001s, db.ACS12_5yr_B01003001s)\n", + "\n", + "QESL_sen = se_sum(db.ACS12_5yr_B16004029s, db.ACS12_5yr_B16004030s,\n", + " db.ACS12_5yr_B16004034s, db.ACS12_5yr_B16004035s,\n", + " db.ACS12_5yr_B16004039s, db.ACS12_5yr_B16004040s,\n", + " db.ACS12_5yr_B16004044s, db.ACS12_5yr_B16004045s,\n", + " db.ACS12_5yr_B16004051s, db.ACS12_5yr_B16004052s,\n", + " db.ACS12_5yr_B16004056s, db.ACS12_5yr_B16004057s,\n", + " db.ACS12_5yr_B16004061s, db.ACS12_5yr_B16004062s,\n", + " db.ACS12_5yr_B16004066s, db.ACS12_5yr_B16004067s)\n", + "QESL_sed = se_sum(db.ACS12_5yr_B16004024s, db.ACS12_5yr_B16004046s,\n", + " db.ACS12_5yr_B16004025s, db.ACS12_5yr_B16004047s)\n", + "db1['QESL_SE'] = se_prop(db1.QESL, (db.ACS12_5yr_B16004024 + db.ACS12_5yr_B16004046) - \n", + " (db.ACS12_5yr_B16004025 + db.ACS12_5yr_B16004047),\n", + " QESL_sen, QESL_sed)\n", + "db1.QESL_SE = db1.QESL_SE.replace([np.inf,-np.inf], 0)\n", + "db1.QESL_SE[db1.QESL==0] = 0\n", + "\n", + "QCVLUN_sen = se_sum(db.ACS12_5yr_B23022025s, db.ACS12_5yr_B23022049s)\n", + "db1['QCVLUN_SE'] = se_prop(db1.QCVLUN, db.ACS12_5yr_B23022001, \n", + " QCVLUN_sen, db.ACS12_5yr_B23022001s)\n", + "\n", + "db1['QPOVTY_SE'] = se_prop(db1.QPOVTY, db.ACS12_5yr_B17021001,\n", + " db.ACS12_5yr_B17021002s, db.ACS12_5yr_B17021001s)\n", + "db1['QMOHO_SE'] = se_prop(db1.QMOHO, db.ACS12_5yr_B25024001,\n", + " db.ACS12_5yr_B25024010s, db.ACS12_5yr_B25024001s)\n", + "\n", + "QED12LES_sen = se_sum(db.ACS12_5yr_B15002003s, db.ACS12_5yr_B15002004s,\n", + " db.ACS12_5yr_B15002005s, db.ACS12_5yr_B15002006s,\n", + " db.ACS12_5yr_B15002007s, db.ACS12_5yr_B15002008s,\n", + " db.ACS12_5yr_B15002009s, db.ACS12_5yr_B15002010s,\n", + " db.ACS12_5yr_B15002020s, db.ACS12_5yr_B15002021s,\n", + " db.ACS12_5yr_B15002022s, db.ACS12_5yr_B15002023s,\n", + " db.ACS12_5yr_B15002024s, db.ACS12_5yr_B15002025s,\n", + " db.ACS12_5yr_B15002026s, db.ACS12_5yr_B15002027s)\n", + "db1['QED12LES_SE'] = se_prop(db1.QED12LES, db.ACS12_5yr_B15002001,\n", + " QED12LES_sen, db.ACS12_5yr_B15002001s)\n", + "\n", + "db1['QFEMLBR_SE'] = se_prop(db1.QFEMLBR, db.ACS12_5yr_C24010001,\n", + " db.ACS12_5yr_C24010038s, db.ACS12_5yr_C24010001s)\n", + "\n", + "\n", + "QEXTRCT_sen = se_sum(db.ACS12_5yr_C24030003s, db.ACS12_5yr_C24030030s)\n", + "db1['QEXTRCT_SE'] = se_prop(db1.QEXTRCT, db.ACS12_5yr_C24030001,\n", + " QEXTRCT_sen, db.ACS12_5yr_C24030001s)\n", + "\n", + "QSERV_sen = se_sum(db.ACS12_5yr_C24010019s, db.ACS12_5yr_C24010055s)\n", + "db1['QSERV_SE'] = se_prop(db1.QSERV, db.ACS12_5yr_C24010001,\n", + " QSERV_sen, db.ACS12_5yr_C24010001s)\n", + "\n", + "db1['QSSBEN_SE'] = se_prop(db1.QSSBEN, db.ACS12_5yr_B19055001, \n", + " db.ACS12_5yr_B19055002s, db.ACS12_5yr_B19055001s)\n", + 
"\n", + "QNOAUTO_sen = se_sum(db.ACS12_5yr_B25044003s, db.ACS12_5yr_B25044010s)\n", + "db1['QNOAUTO_SE'] = se_prop(db1.QNOAUTO, db.ACS12_5yr_B25044001,\n", + " QNOAUTO_sen, db.ACS12_5yr_B25044001s)\n", + "\n", + "db1['QFAM_SE'] = se_prop(db1.QFAM, db.ACS12_5yr_B09002001, \n", + " db.ACS12_5yr_B09002002s, db.ACS12_5yr_B09002001s)\n", + "db1.QFAM_SE = db1.QFAM_SE.replace([np.inf,-np.inf], 0)\n", + "\n", + "db1['QRICH200K_SE']= se_prop(db1.QRICH200K, db.ACS12_5yr_B11001001, \n", + " db.ACS12_5yr_B19001017s, db.ACS12_5yr_B11001001s)\n", + "\n", + "\n", + "#############################\n", + "\n", + "# ACS standard errors (alternatives)\n", + "db1['PERCAP_ALT_SE'] = se_ratio(db1.PERCAP_ALT, db.ACS12_5yr_B25008001,\n", + " db.ACS12_5yr_B19025001s, db.ACS12_5yr_B25008001s)\n", + "\n", + "QESL_ALT_sen = se_sum(db.ACS12_5yr_B06007005s, db.ACS12_5yr_B06007008s)\n", + "db1['QESL_ALT_SE'] = se_prop(db1.QESL_ALT, db.ACS12_5yr_B06007001,\n", + " QESL_ALT_sen, db.ACS12_5yr_B06007001s)\n", + "\n", + "db1['QED12LES_ALT_SE'] = se_prop(db1.QED12LES_ALT, db.ACS12_5yr_B16010001,\n", + " db.ACS12_5yr_B16010002s, db.ACS12_5yr_B16010001s)\n", + "db1['QEXTRCT_ALT_SE'] = se_prop(db1.QEXTRCT_ALT, db.ACS12_5yr_C24050001,\n", + " db.ACS12_5yr_C24050002s, db.ACS12_5yr_C24050001s)\n", + "db1['QSERV_ALT_SE'] = se_prop(db1.QSERV_ALT, db.ACS12_5yr_C24050001,\n", + " db.ACS12_5yr_C24050029s, db.ACS12_5yr_C24050001s)\n", + "db1['QNOAUTO_ALT_SE'] = se_prop(db1.QNOAUTO_ALT, db.ACS12_5yr_B08201001,\n", + " db.ACS12_5yr_B08201002s, db.ACS12_5yr_B08201001s)\n", + "db1['MDGRENT_ALT_SE'] = db.ACS12_5yr_B25064001s\n", + "db1['MHSEVAL_ALT_SE'] = db.ACS12_5yr_B25077001s\n", + "db1.MHSEVAL_ALT_SE[np.isnan(db1.MHSEVAL_ALT)] = db1.MHSEVAL_ALT_LAG_SE[np.isnan(db1.MHSEVAL_ALT)] # replace NA with lag \n", + "db1.MHSEVAL_ALT_SE[np.isnan(db1.MHSEVAL_ALT_SE)] = db1.MHSEVAL_ALT_LAG_SE[np.isnan(db1.MHSEVAL_ALT_SE)] # replace NA with lag \n", + "db1['POPDENS_SE'] =se_ratio(db1.POPDENS, db.SE_T02A_002,\n", + " db.ACS12_5yr_B01003001s, 0) # these are nearly all zero since county pops tend to have 0 MOE\n", + "\n", + "#############################\n", + "\n", + "# Tests to validate equivalency \n", + "def equal_test(orig, alt):\n", + " if np.equal(orig, alt).sum() != db.shape[0]:\n", + " if (db.shape[0] - np.equal(orig, alt).sum()) == np.isnan(orig).sum() == np.isnan(alt).sum():\n", + " pass\n", + " else:\n", + " raise Exception, 'problem'\n", + "equal_test(db1.QUNOCCHU, db1.QUNOCCHU_ALT)\n", + "equal_test(db1.QED12LES, db1.QED12LES_ALT)\n", + "equal_test(db1.QEXTRCT, db1.QEXTRCT_ALT)\n", + "equal_test(db1.QSERV, db1.QSERV_ALT) \n", + "equal_test(db1.QNOAUTO, db1.QNOAUTO_ALT) \n", + "\n", + "\n", + "\n", + "#############################\n", + "\n", + "# Add in the sample sizes\n", + "db1['sample_person'] = db.ACS12_5yr_B00001001\n", + "db1['sample_hu'] = db.ACS12_5yr_B00002001\n", + "\n", + "#############################\n", + "\n", + "#the final data frame is written to disk\n", + "db1.to_csv(os.path.join(path,'sovi_inputs.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Analysis\n", + "\n", + "Here we load the data `db1` or alternative add a refernce to db1 with a new name `US_ALL`. We describe each of the input variables with a human readable name and outline their expected contributions to vulnerability. 
For example, the variable `MEDAGE_ACS`, which measures the median age in the county is expected to have a positive (\"pos\") contribution to social vulnerability - the logic is that older populations are more vulnerable to disasters. Similarly the variable `QRICH200K`, which measures the portion of hosuing units making over $200k/year is expected to have a negative (\"neg\") contribution to social vulnerability. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Import Data\n", + "US_All = db1.copy()\n", + "US_All['Geo_FIPS'] = US_All.index.values\n", + "#US_All = pd.read_csv(path+'/data/input/sovi_inputs.csv')\n", + "#US_All.index = US_All.Geo_FIPS\n", + "\n", + "\n", + "# attribute name and expected influence on vulnerability\n", + "input_names = [['MEDAGE_ACS','pos','person','Median Age'],\n", + " ['BLACK_ACS','pos','person','Pop African-American (%)'],\n", + " ['QNATAM_ACS','pos','person','Pop Native American (%)'],\n", + " ['QASIAN_ACS','pos','person','Pop Asian (%)'],\n", + " ['QHISP_ACS','pos','person','Pop Hispanic (%)'],\n", + " ['QAGEDEP_ACS','pos','person','Age Dependency (%)'],\n", + " ['QPUNIT_ACS','pos','person','Persons Per Housing Unit'],\n", + " ['PRENTER_ACS','pos','hu','Rental Housing (%)'],\n", + " ['QNRRES_ACS','pos','person','Nursing Home Residents (%)'],\n", + " ['QFEMALE_ACS','pos','person','Pop Female (%)'],\n", + " ['QFHH_ACS','pos','hu','Female-Headed Households (%)'],\n", + " ['QUNOCCHU_ACS','pos','hu','Vacant Housing (%)'],\n", + " ['PERCAP_ALT','neg','person','Per-Capita Income'],\n", + " ['QESL_ALT','pos','person','English as Second Language (%)'],\n", + " ['QCVLUN','pos','person','Unemployment (%)'],\n", + " ['QPOVTY','pos','person','Poverty (%)'],\n", + " ['QMOHO','pos','hu','Mobile Homes (%)'],\n", + " ['QED12LES_ALT','pos','person','Adults Completed $200K (%)'],\n", + " ['MDGRENT_ALT','neg','hu','Median Rent'],\n", + " ['MHSEVAL_ALT','neg','hu','Median Home Value'],\n", + " ['POPDENS','pos','person','Population Density']] \n", + "\n", + "#Get attribute names\n", + "attr_names=[j[0] for j in input_names]\n", + "#cols = [c for c in US_All.columns if c.find('_SE') == -1]\n", + "\n", + "attr_names.append('Geo_FIPS')\n", + "#US_All = US_All.dropna(axis=0) #two counties misisng data in state 15 and 48\n", + "US_All = US_All[attr_names]\n", + "US_All['stateID'] = US_All.Geo_FIPS.str.slice(0,3,1)\n", + "attr_names.remove('Geo_FIPS')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####Flipping Signs\n", + "\n", + "To ensure that each variable contributes as expected to the final Sovi Index following Eric Tate (2012?) we flip the signs of the input data. Variables where a high value are expected to contribute negatively to Social vulnerability have their signs flipped, such that positive values become negative. This has the effect of reversing the sign of the scores used to compute the index. Consider the variable measuring the percent of households making over \\$200,000 per year, if the mean at the county-level in a state was 5\\% a county two SD above the mean, say one where 10\\% of the population made mroe than $200K, that would have a positive z-score (+2). However, this high prevalance of wealthy people should reduce the vulnerability, by flipping the signs we ensure that that mean is now -5\\% and a value of -10\\% is two standard deviations *below* the mean. 
Thus when multiplied by its (positive) loading a county whit high wealth will have a lower social vulnerability index." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# input data prep\n", + "# --swap signs of the attributes expected to have a \"negative\" affect on vulnerability\n", + "for name, sign, sample, hrname in input_names:\n", + " if sign == 'neg':\n", + " US_All[name] = -US_All[name].values\n", + " elif sign == 'pos':\n", + " pass\n", + " else:\n", + " raise Exception, \"problem\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build FEMA Region, State, and National Files. \n", + "## Create data frames to hold analysis results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this block of code we do two things. First, we create subsets of the national file for each FEMA region. Second, we create pandas data frames to hold the results of the drop 1 and analysis and each of the " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "########################################\n", + "##SOVI FOR FEMA REGIONS\n", + "#########################################\n", + "#Build FEMA subRegions Dict values= state ID's\n", + "FEMA_subs= {'FEMA_1':['g23','g50','g33','g25','g09','g44']}\n", + "FEMA_subs['FEMA_2'] = ['g36','g34']\n", + "FEMA_subs['FEMA_3'] = ['g42','g10','g11','g24','g51','g54']\n", + "FEMA_subs['FEMA_4'] = ['g21','g47','g37','g28','g01','g13','g45','g12']\n", + "FEMA_subs['FEMA_5'] = ['g27','g55','g26','g17','g18','g39']\n", + "FEMA_subs['FEMA_6'] = ['g35','g48','g40','g05','g22']\n", + "FEMA_subs['FEMA_7'] = ['g31','g19','g20','g29']\n", + "FEMA_subs['FEMA_8'] = ['g30','g38','g56','g46','g49','g08']\n", + "FEMA_subs['FEMA_9'] = ['g06','g32','g04']\n", + "FEMA_subs['FEMA_10'] = ['g53','g41','g16']\n", + "\n", + "#Dict to hold variable loadings\n", + "varContrib = {}\n", + "\n", + "#Multiindexed DataFrame to hold all FEMA SOVI Scores\n", + "#Level 1 of the index is the fema region.\n", + "#Level 2 of the FEMA Region is the county FIPS\n", + "geoLevels = US_All.Geo_FIPS\n", + "femaLevels = FEMA_subs.keys()\n", + "geoLabels = []\n", + "femaLabels = []\n", + "for f in femaLevels:\n", + " femaRegionIndexes = US_All[US_All['stateID'].isin(FEMA_subs[f])].index.values\n", + " geoLabels.extend([US_All.index.get_loc(i) for i in femaRegionIndexes])\n", + " femaLabels.extend(np.repeat(femaLevels.index(f), len(femaRegionIndexes)))\n", + "\n", + "US_femaSub_Multi_Index = pd.MultiIndex(levels=[femaLevels, geoLevels], \n", + " labels=[femaLabels, geoLabels], \n", + " names=['FEMA_Region', 'Geo_FIPS'])\n", + "\n", + "#In the FEMA_Region_Sovi_Score data frame ranks are BY FEMA REGION. 
\n", + "#The data frame holds both the SOVI score and the county rank\n", + "#This means that there should be 10 counties with rank 1 (one for each FEMA Region)\n", + "FEMA_Region_Sovi_Score = pd.DataFrame(index=US_femaSub_Multi_Index, columns=['sovi', 'rank']) \n", + "\n", + "\n", + "#Create New England conglomerate of states\n", + "#These are the FIPS codes for the states wit hthe letter \"g\" appended\n", + "US_All.loc[US_All.stateID.isin(['g23','g33','g25']), 'stateID'] = 'g23g33g25'\n", + "\n", + "#These are the states in the state level analysis\n", + "stateList = ['g23g33g25', 'g36','g51','g13','g17','g48','g29','g46','g06','g16']\n", + "\n", + "#In the FEMA_State_Sovi_Score data frame ranks are BY STATE. \n", + "#The data frame holds both the SOVI score and the county rank\n", + "#This means that there should be 10 coutnies with rank 1 (one for each state in stateList)\n", + "FEMA_State_Sovi_Score = pd.DataFrame(index=US_All.index[US_All.stateID.isin(stateList)], columns=['sovi', 'rank']) \n", + "\n", + "US_All[US_All.stateID.isin(stateList)]\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Calculate Sovi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for i in FEMA_subs:\n", + " \n", + " #Subset FEMA subregion\n", + " FEMARegionData=US_All[US_All['stateID'].isin(FEMA_subs[i])]\n", + "\n", + " # compute SoVI\n", + " inputData = FEMARegionData.drop(['Geo_FIPS','stateID'], axis = 1, inplace = False)\n", + " pca = SPSS_PCA(inputData, reduce=True, varimax=True)\n", + " sovi_actual = pca.scores_rot.sum(1)\n", + " sovi_actual = pd.DataFrame(sovi_actual, index=FEMARegionData.Geo_FIPS, columns=['sovi'])\n", + " attrib_contribution = pca.weights_rot.sum(1)\n", + " \n", + " FEMA_Region_Sovi_Score.loc[i, 'sovi'] = sovi_actual.values\n", + " ranks_tmp = pd.DataFrame(abs(FEMA_Region_Sovi_Score.loc[i, 'sovi']).rank(method='average'))\n", + " ranks_tmp.columns = ['rank']\n", + " FEMA_Region_Sovi_Score.loc[i, 'rank'] = 1 + (len(FEMA_Region_Sovi_Score.loc[i,'rank']) - ranks_tmp.values) \n", + "\n", + " \n", + " ##Write attribute contribution output \n", + " #Generate dictionary for all net loadings by variable and region\n", + " varContrib[i]=zip(attr_names,attrib_contribution.tolist())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute National SoVI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#######################\n", + "##Compute National SoVI\n", + "#######################\n", + "# compute SoVI\n", + "inputData = US_All.drop(['Geo_FIPS','stateID'], axis = 1, inplace = False)\n", + "pca = SPSS_PCA(inputData, reduce=True, varimax=True)\n", + "sovi_actual_us = pca.scores_rot.sum(1)\n", + "sovi_actual_us = pd.DataFrame(sovi_actual_us, index=US_All.Geo_FIPS, columns=['sovi'])\n", + "attrib_contribution_us = pca.weights_rot.sum(1)\n", + "\n", + "#redundancy w/ranks here to preserve compatibility\n", + "US_All_Full_Sovi_Rank = abs(sovi_actual_us).apply(rankdata, axis=0, method='average')\n", + "sovi_actual_us['rank'] = abs(sovi_actual_us).apply(rankdata, axis=0, method='average')\n", + "sovi_actual_us['rank'] = 1 + (len(sovi_actual_us['rank']) - sovi_actual_us['rank'])\n", + "\n", + "#Generate dictionary for all net loadings by variable and region\n", + "varContrib['USA']=zip(attr_names,attrib_contribution.tolist())" + ] + }, 
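+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch of the ranking convention used above (toy numbers, not analysis output).\n",
+    "# Ranks are assigned ascending, so 1 + (N - rank) flips them such that the county with the\n",
+    "# largest absolute SoVI score receives rank 1. Assumes pandas imported above as pd.\n",
+    "toy = pd.Series([0.2, 1.5, 0.7])\n",
+    "asc = abs(toy).rank(method='average')   # ascending ranks: [1, 3, 2]\n",
+    "print(1 + (len(toy) - asc))             # flipped ranks: [3, 1, 2]"
+   ]
+  },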
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute SoVI for selected state(s) in each FEMA Region" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#############################################\n", + "##State Analysis \n", + "#############################################\n", + "for st in stateList:\n", + " #Subset FEMA subregion\n", + " stateData=US_All[US_All.stateID == st]\n", + "\n", + " # compute SoVI\n", + " inputData = stateData.drop(['Geo_FIPS','stateID'], axis = 1, inplace = False)\n", + " pca = SPSS_PCA(inputData, reduce=True, varimax=True)\n", + " sovi_actual = pca.scores_rot.sum(1)\n", + " sovi_actual = pd.DataFrame(sovi_actual, index=stateData.Geo_FIPS, columns=['sovi'])\n", + " attrib_contribution = pca.weights_rot.sum(1)\n", + " \n", + " \n", + " #sovi_alt_computation = (zinputs * attrib_contribution).sum(1) # this is just a check\n", + " #sovi_alt_computation = pd.DataFrame(sovi_alt_computation, columns=['sovi'])\n", + " #if not np.allclose(sovi_actual, sovi_alt_computation):\n", + " # raise Exception, \"mismatch\"\n", + " ######################\n", + "'''##Compute spearman correlation\n", + "######################\n", + "##DO THIS USING THE USA AND STATE?FEMA REGION SPECIFIC SOVI\n", + "\n", + "#compute US base ranks for all drop 1 correlations\n", + "spearmanr(US_SoVI_Drop1_Score[])\n", + "\n", + "#rankContrib = (28-rankContrib) + 1\n", + "#compare Fema regions and US\n", + "\n", + "#compare fema regions and states\n", + "\n", + "#compare states and US\n", + "US_SoVI_Drop1_Score.\n", + "#compare drop 1 to full sovi\n", + "scipy.stats.mstats.spearmanr'''\n", + " \n", + " ##Write attribute contribution output \n", + " #Generate dictionary for all net loadings by variable and region\n", + " varContrib[st]=zip(attr_names,attrib_contribution.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
USAFEMA_1g23g33g25FEMA_2g36FEMA_3g51FEMA_4g13FEMA_5...FEMA_6g48FEMA_7g29FEMA_8g46FEMA_9g06FEMA_10g16
Age Dependency (%)0.76 (1)0.5 (4)0.56 (2)0.17 (12)0.6 (1)0.38 (7)0.27 (12)0.38 (8)0.15 (20)0.62 (3)...0.56 (4)0.18 (20)0.33 (11)0.44 (8)0.51 (7)0.14 (21)0.38 (10)0.23 (18)0.29 (12)0.12 (24)
Pop Female (%)0.74 (2)0.51 (3)0.42 (8)-0.18 (11)0.53 (5)-0.39 (6)-0.09 (24)-0.26 (11)-0.34 (8)0.86 (1)...0.54 (5)-0.35 (9)0.81 (1)0.64 (1)0.75 (2)0.48 (6)-0.02 (27)-0.56 (3)0.41 (5)0.05 (28)
Service Sector Employment (%)0.65 (3)-0.12 (21)-0.1 (22)0.43 (5)-0.02 (26)0.66 (2)0.23 (14)0.81 (1)0.1 (22)0.11 (22)...0.61 (1)0.39 (6)0.21 (15)0.26 (13)0.2 (20)0.24 (14)0.35 (11)0.32 (11)0.44 (3)0.16 (23)
Pop Hispanic (%)0.56 (4)0.22 (15)0.33 (13)-0.12 (20)0.02 (28)0.63 (3)0.56 (2)0.46 (5)0.37 (6)0.39 (6)...0.38 (10)-0.03 (27)0.4 (7)0.48 (4)0.52 (6)0.5 (5)0.28 (17)0.27 (16)0.33 (9)0.25 (14)
Female Employment (%)0.51 (5)0.1 (22)0.3 (14)0.51 (4)0.45 (6)0.07 (25)-0.19 (16)0.16 (17)-0.3 (11)0.14 (21)...0.48 (6)0.13 (22)0.16 (19)0.54 (2)0.79 (1)0.4 (8)0.28 (16)-0.43 (6)0.41 (4)0.36 (7)
Pop Native American (%)0.49 (6)0.54 (2)0.44 (7)0.57 (1)-0.55 (2)-0.23 (12)-0.3 (10)0.81 (2)-0.53 (1)0.64 (2)...0.45 (8)-0.06 (25)0.58 (2)0.44 (7)-0.08 (27)-0.07 (24)0.4 (9)-0.3 (15)-0.05 (28)0.28 (11)
English as Second Language (%)0.48 (7)0.3 (8)0.36 (11)-0.12 (19)-0.04 (24)0.58 (4)0.59 (1)0.47 (4)0.5 (3)0.47 (5)...0.32 (12)-0.06 (26)0.39 (9)0.44 (6)0.38 (10)0.31 (10)0.49 (5)0.3 (14)0.24 (16)0.21 (16)
Social Security Income (%)0.4 (8)0.46 (5)0.5 (4)0.13 (16)0.31 (8)0.24 (11)0.24 (13)0.25 (12)0.23 (16)0.31 (10)...0.34 (11)0.26 (15)0.17 (18)0.24 (14)0.68 (3)0.36 (9)0.47 (6)0.26 (17)0.27 (14)0.17 (21)
No Automobile (%)0.4 (9)0.08 (25)0.0 (28)0.35 (7)0.21 (14)0.1 (21)0.31 (7)0.24 (13)0.31 (10)0.59 (4)...0.04 (26)0.45 (1)0.34 (10)0.07 (24)0.24 (17)0.02 (27)0.57 (2)0.47 (5)0.32 (11)0.22 (15)
Mobile Homes (%)-0.37 (10)-0.2 (17)-0.47 (5)0.07 (25)-0.02 (27)0.04 (26)0.28 (11)0.04 (27)0.08 (26)-0.25 (14)...-0.59 (2)-0.43 (4)-0.2 (16)0.03 (27)0.25 (16)0.73 (2)-0.15 (21)0.3 (13)0.21 (19)0.4 (5)
Poverty (%)0.34 (11)0.15 (20)0.02 (27)0.25 (8)0.28 (11)0.17 (16)0.33 (6)0.29 (10)0.33 (9)0.25 (13)...0.45 (7)0.33 (12)0.28 (13)0.4 (10)0.27 (14)0.14 (19)0.51 (4)0.19 (19)0.39 (6)0.36 (8)
Nursing Home Residents (%)0.33 (12)0.46 (6)0.35 (12)-0.16 (14)0.54 (4)0.51 (5)0.16 (17)0.52 (3)0.44 (4)0.22 (18)...0.29 (14)0.33 (11)0.43 (6)0.28 (11)-0.29 (13)-0.28 (11)0.86 (1)0.66 (2)0.33 (10)0.17 (22)
Children in Married Families (%)0.29 (13)-0.09 (24)0.05 (25)0.25 (9)0.54 (3)0.29 (9)0.1 (22)0.2 (15)0.08 (25)0.38 (7)...-0.04 (27)0.45 (2)0.5 (3)0.49 (3)0.18 (22)0.28 (12)0.3 (14)0.05 (28)0.5 (1)0.55 (1)
Vacant Housing (%)0.28 (14)-0.24 (13)-0.13 (19)0.51 (3)0.14 (20)0.76 (1)0.51 (4)0.38 (7)0.2 (19)0.31 (9)...0.14 (20)0.28 (14)-0.17 (17)0.4 (9)0.45 (9)0.73 (1)0.45 (8)-0.41 (9)-0.27 (13)-0.28 (12)
Per-Capita Income0.28 (15)0.05 (28)0.09 (23)-0.08 (23)0.13 (21)0.25 (10)0.15 (18)0.15 (20)-0.1 (23)0.24 (17)...0.38 (9)0.19 (19)0.23 (14)0.19 (16)0.27 (15)0.25 (13)0.08 (24)-0.15 (22)0.18 (20)0.2 (17)
Pop African-American (%)-0.27 (16)0.09 (23)0.05 (24)0.06 (27)0.04 (23)0.19 (14)-0.11 (20)-0.12 (22)0.09 (24)-0.06 (26)...-0.57 (3)0.44 (3)0.06 (25)-0.02 (28)0.34 (11)0.46 (7)-0.53 (3)-0.31 (12)-0.24 (18)0.39 (6)
Median Rent0.23 (17)0.27 (10)0.2 (16)0.13 (17)0.3 (10)0.04 (27)0.08 (25)0.06 (24)0.3 (13)0.24 (16)...0.3 (13)0.35 (8)0.31 (12)0.18 (18)-0.12 (25)0.15 (18)0.06 (26)0.07 (26)0.07 (26)0.06 (27)
Annual Income >$200K (%)0.22 (18)-0.2 (18)-0.16 (18)0.07 (24)0.22 (13)0.12 (19)-0.04 (27)0.21 (14)-0.3 (12)0.1 (24)...0.26 (15)0.16 (21)0.03 (27)0.05 (26)0.31 (12)0.55 (3)-0.15 (22)-0.15 (23)0.15 (22)0.17 (20)
Median Age0.21 (19)0.25 (12)0.38 (9)0.17 (13)0.31 (9)0.18 (15)0.11 (19)0.16 (19)0.26 (15)0.24 (15)...0.09 (25)0.13 (23)0.02 (28)0.16 (21)0.47 (8)0.21 (15)0.28 (15)0.07 (27)0.08 (23)0.07 (26)
Female-Headed Households (%)0.21 (20)0.26 (11)0.56 (3)-0.05 (28)0.27 (12)0.1 (22)-0.04 (28)-0.01 (28)-0.07 (27)0.25 (12)...0.09 (24)0.22 (18)0.5 (4)0.21 (15)0.01 (28)0.06 (25)0.22 (19)-0.15 (24)0.45 (2)0.5 (2)
Rental Housing (%)0.17 (21)0.06 (27)-0.03 (26)0.09 (21)0.17 (16)-0.13 (18)0.1 (21)0.3 (9)-0.11 (21)-0.08 (25)...0.23 (17)0.23 (16)0.07 (24)0.16 (19)-0.17 (23)-0.04 (26)-0.08 (25)0.4 (10)0.37 (7)0.46 (3)
Population Density0.16 (22)-0.07 (26)-0.19 (17)0.38 (6)0.17 (17)0.1 (20)0.31 (9)0.16 (18)0.23 (17)0.27 (11)...0.2 (18)0.32 (13)0.1 (23)0.08 (23)0.65 (4)0.21 (16)0.35 (12)0.41 (7)-0.25 (15)0.18 (19)
Unemployment (%)-0.15 (23)0.69 (1)0.74 (1)0.2 (10)-0.17 (18)0.21 (13)0.31 (8)0.16 (16)0.35 (7)-0.22 (19)...-0.11 (23)0.4 (5)-0.11 (22)-0.18 (17)0.52 (5)0.13 (22)0.26 (18)0.41 (8)0.15 (21)0.28 (13)
Median Home Value0.14 (24)0.27 (9)0.11 (21)-0.06 (26)0.12 (22)0.09 (23)0.09 (23)0.05 (26)0.04 (28)0.11 (23)...0.18 (19)0.34 (10)0.43 (5)0.27 (12)-0.1 (26)0.09 (23)-0.34 (13)-0.16 (21)0.08 (24)0.32 (10)
Adults Completed <Grade 12 (%)0.14 (25)0.42 (7)0.45 (6)0.08 (22)0.19 (15)0.32 (8)0.49 (5)0.14 (21)0.27 (14)0.21 (20)...0.11 (22)0.08 (24)0.39 (8)-0.05 (25)0.23 (19)-0.01 (28)0.46 (7)0.52 (4)0.34 (8)0.42 (4)
Extractive Sector Employment (%)0.11 (26)0.23 (14)0.22 (15)-0.15 (15)0.03 (25)-0.02 (28)0.51 (3)0.06 (25)0.51 (2)0.32 (8)...-0.23 (16)0.0 (28)-0.05 (26)0.11 (22)-0.17 (24)0.15 (17)-0.21 (20)0.89 (1)0.05 (27)0.09 (25)
Pop Asian (%)-0.09 (27)0.21 (16)0.12 (20)-0.12 (18)-0.16 (19)0.07 (24)0.2 (15)0.07 (23)0.22 (18)0.02 (28)...0.12 (21)0.23 (17)-0.15 (20)0.16 (20)0.23 (18)0.51 (4)-0.0 (28)-0.1 (25)-0.24 (17)0.33 (9)
Persons Per Housing Unit0.02 (28)0.17 (19)0.36 (10)-0.55 (2)-0.35 (7)0.13 (17)-0.04 (26)-0.41 (6)-0.37 (5)0.06 (27)...-0.02 (28)-0.37 (7)0.13 (21)-0.46 (5)-0.19 (21)-0.14 (20)0.14 (23)-0.19 (20)-0.07 (25)-0.2 (18)
\n", + "

28 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " USA FEMA_1 g23g33g25 \\\n", + "Age Dependency (%) 0.76 (1) 0.5 (4) 0.56 (2) \n", + "Pop Female (%) 0.74 (2) 0.51 (3) 0.42 (8) \n", + "Service Sector Employment (%) 0.65 (3) -0.12 (21) -0.1 (22) \n", + "Pop Hispanic (%) 0.56 (4) 0.22 (15) 0.33 (13) \n", + "Female Employment (%) 0.51 (5) 0.1 (22) 0.3 (14) \n", + "Pop Native American (%) 0.49 (6) 0.54 (2) 0.44 (7) \n", + "English as Second Language (%) 0.48 (7) 0.3 (8) 0.36 (11) \n", + "Social Security Income (%) 0.4 (8) 0.46 (5) 0.5 (4) \n", + "No Automobile (%) 0.4 (9) 0.08 (25) 0.0 (28) \n", + "Mobile Homes (%) -0.37 (10) -0.2 (17) -0.47 (5) \n", + "Poverty (%) 0.34 (11) 0.15 (20) 0.02 (27) \n", + "Nursing Home Residents (%) 0.33 (12) 0.46 (6) 0.35 (12) \n", + "Children in Married Families (%) 0.29 (13) -0.09 (24) 0.05 (25) \n", + "Vacant Housing (%) 0.28 (14) -0.24 (13) -0.13 (19) \n", + "Per-Capita Income 0.28 (15) 0.05 (28) 0.09 (23) \n", + "Pop African-American (%) -0.27 (16) 0.09 (23) 0.05 (24) \n", + "Median Rent 0.23 (17) 0.27 (10) 0.2 (16) \n", + "Annual Income >$200K (%) 0.22 (18) -0.2 (18) -0.16 (18) \n", + "Median Age 0.21 (19) 0.25 (12) 0.38 (9) \n", + "Female-Headed Households (%) 0.21 (20) 0.26 (11) 0.56 (3) \n", + "Rental Housing (%) 0.17 (21) 0.06 (27) -0.03 (26) \n", + "Population Density 0.16 (22) -0.07 (26) -0.19 (17) \n", + "Unemployment (%) -0.15 (23) 0.69 (1) 0.74 (1) \n", + "Median Home Value 0.14 (24) 0.27 (9) 0.11 (21) \n", + "Adults Completed $200K (%) 0.07 (24) 0.22 (13) 0.12 (19) \n", + "Median Age 0.17 (13) 0.31 (9) 0.18 (15) \n", + "Female-Headed Households (%) -0.05 (28) 0.27 (12) 0.1 (22) \n", + "Rental Housing (%) 0.09 (21) 0.17 (16) -0.13 (18) \n", + "Population Density 0.38 (6) 0.17 (17) 0.1 (20) \n", + "Unemployment (%) 0.2 (10) -0.17 (18) 0.21 (13) \n", + "Median Home Value -0.06 (26) 0.12 (22) 0.09 (23) \n", + "Adults Completed $200K (%) -0.04 (27) 0.21 (14) -0.3 (12) \n", + "Median Age 0.11 (19) 0.16 (19) 0.26 (15) \n", + "Female-Headed Households (%) -0.04 (28) -0.01 (28) -0.07 (27) \n", + "Rental Housing (%) 0.1 (21) 0.3 (9) -0.11 (21) \n", + "Population Density 0.31 (9) 0.16 (18) 0.23 (17) \n", + "Unemployment (%) 0.31 (8) 0.16 (16) 0.35 (7) \n", + "Median Home Value 0.09 (23) 0.05 (26) 0.04 (28) \n", + "Adults Completed $200K (%) 0.1 (24) ... 0.26 (15) \n", + "Median Age 0.24 (15) ... 0.09 (25) \n", + "Female-Headed Households (%) 0.25 (12) ... 0.09 (24) \n", + "Rental Housing (%) -0.08 (25) ... 0.23 (17) \n", + "Population Density 0.27 (11) ... 0.2 (18) \n", + "Unemployment (%) -0.22 (19) ... -0.11 (23) \n", + "Median Home Value 0.11 (23) ... 
0.18 (19) \n", + "Adults Completed $200K (%) 0.16 (21) 0.03 (27) 0.05 (26) \n", + "Median Age 0.13 (23) 0.02 (28) 0.16 (21) \n", + "Female-Headed Households (%) 0.22 (18) 0.5 (4) 0.21 (15) \n", + "Rental Housing (%) 0.23 (16) 0.07 (24) 0.16 (19) \n", + "Population Density 0.32 (13) 0.1 (23) 0.08 (23) \n", + "Unemployment (%) 0.4 (5) -0.11 (22) -0.18 (17) \n", + "Median Home Value 0.34 (10) 0.43 (5) 0.27 (12) \n", + "Adults Completed $200K (%) 0.31 (12) 0.55 (3) -0.15 (22) \n", + "Median Age 0.47 (8) 0.21 (15) 0.28 (15) \n", + "Female-Headed Households (%) 0.01 (28) 0.06 (25) 0.22 (19) \n", + "Rental Housing (%) -0.17 (23) -0.04 (26) -0.08 (25) \n", + "Population Density 0.65 (4) 0.21 (16) 0.35 (12) \n", + "Unemployment (%) 0.52 (5) 0.13 (22) 0.26 (18) \n", + "Median Home Value -0.1 (26) 0.09 (23) -0.34 (13) \n", + "Adults Completed $200K (%) -0.15 (23) 0.15 (22) 0.17 (20) \n", + "Median Age 0.07 (27) 0.08 (23) 0.07 (26) \n", + "Female-Headed Households (%) -0.15 (24) 0.45 (2) 0.5 (2) \n", + "Rental Housing (%) 0.4 (10) 0.37 (7) 0.46 (3) \n", + "Population Density 0.41 (7) -0.25 (15) 0.18 (19) \n", + "Unemployment (%) 0.41 (8) 0.15 (21) 0.28 (13) \n", + "Median Home Value -0.16 (21) 0.08 (24) 0.32 (10) \n", + "Adults Completed