# ===========================================================================
# assignment9.py
# ===========================================================================
"""
Created on Mon Dec 5 01:37:07 2016

@author: Shucheng Yan
net ID: sy1253

credit wz1070 for data structure insight and unittest
"""

import sys

import matplotlib.pyplot as plt
import pandas as pd

from dataAnalysis import dataAnalysis
from summaryInfo import displaySummary, mergeByYear

# Inclusive range of years the user may ask for.
MIN_YEAR = 1800
MAX_YEAR = 2012

# Years plotted by region once the user types 'finish'.
REPORT_YEARS = range(2007, 2013)

PROMPT = "please enter a year between {} and {} to see income distribution: ".format(
    MIN_YEAR, MAX_YEAR
)
INVALID_YEAR_MESSAGE = "please enter a valid integer year between {} and {}".format(
    MIN_YEAR, MAX_YEAR
)


def _parse_year(text):
    """Return ``text`` as an int in [MIN_YEAR, MAX_YEAR], or None if invalid."""
    try:
        year = int(text)
    except ValueError:
        return None
    if MIN_YEAR <= year <= MAX_YEAR:
        return year
    return None


def _plot_recent_years(income, countries):
    """Show and save the per-region histogram and boxplot for REPORT_YEARS."""
    for year in REPORT_YEARS:
        merged_data = mergeByYear(year, income, countries)
        # One plotting object per year so titles and file names carry the year.
        year_plots = dataAnalysis(year)
        year_plots.histogram(merged_data)
        year_plots.boxplot(merged_data)


def main():
    """
    Interactively explore income per person.

    Reads 'countries.csv' and the gapminder GDP spreadsheet from the working
    directory, then repeatedly prompts for a year and shows the world-wide
    income histogram for it. Typing 'finish' plots the per-region histogram
    and boxplot for each year in REPORT_YEARS and ends the program.
    Ctrl-C or end-of-input exits quietly.
    """
    countries = pd.read_csv('countries.csv')
    # Transpose so that rows are indexed by year and columns are countries.
    income = pd.read_excel(
        'indicator gapminder gdp_per_capita_ppp.xlsx', index_col=0
    ).transpose()
    print(income.head())

    try:
        while True:
            input_year = input(PROMPT).strip()
            if input_year == 'finish':
                _plot_recent_years(income, countries)
                break

            year = _parse_year(input_year)
            if year is None:
                # Covers both non-integer text and integers outside the range.
                print(INVALID_YEAR_MESSAGE)
                continue
            displaySummary(year, income)
    except (KeyboardInterrupt, EOFError):
        sys.exit()


if __name__ == "__main__":
    main()


# ===========================================================================
# dataAnalysis.py
# ===========================================================================
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 5 10:43:13 2016

@author: kevinyan
"""

import matplotlib.pyplot as plt


class dataAnalysis():
    """Plot the by-region income distribution for one year.

    ``year`` is only used to label the figures and name the saved PDF files.
    """

    def __init__(self, year):
        self.year = year

    def histogram(self, income):
        """
        Plot one histogram of income per person for each region.

        ``income`` is a DataFrame with a 'Region' column to group by; the
        figure is saved as 'histogram in <year>.pdf' and then shown.
        """
        # xlabelsize/ylabelsize are DataFrame.hist's own tick-label size
        # options; an unknown keyword would be forwarded to matplotlib's hist.
        income.hist(by='Region', figsize=(12, 12), xlabelsize=12, ylabelsize=12)
        figure = plt.gcf()
        # A figure-level title keeps the per-region subplot titles intact.
        figure.suptitle(
            "histogram of the distribution of the income in " + str(self.year)
        )
        # Label every region's subplot, not only the last one drawn.
        for axis in figure.axes:
            axis.set_xlabel('Income Per Person')
            axis.set_ylabel('Number of Countries')
        plt.savefig('histogram in ' + str(self.year) + '.pdf')
        plt.show()

    def boxplot(self, income):
        """
        Plot a boxplot of income per person grouped by region.

        ``income`` is a DataFrame with a 'Region' column to group by; the
        figure is saved as 'boxplot in <year>.pdf' and then shown.
        """
        income.boxplot(by='Region', figsize=(12, 12), fontsize=12)
        plt.title("boxplot of the distribution of the income in " + str(self.year))
        plt.xlabel('Region')
        plt.ylabel('Income Per Person')
        plt.savefig('boxplot in ' + str(self.year) + '.pdf')
        plt.show()


# ===========================================================================
# results.txt
# ===========================================================================
# Over the period,
#
# 1. Africa has the largest percentage increase in income per person, but
#    most of its countries still remain poor compared to countries in other
#    regions.
#
# 2. Income per person in Europe and North America decreased noticeably in
#    2008-2009, potentially due to the financial crisis.
#
# 3. Other regions have relatively stable economic landscapes, with small
#    increases in income per person.
# ===========================================================================
# summaryInfo.py
# ===========================================================================
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 5 01:38:39 2016

@author: kevinyan
"""
import pandas as pd
import matplotlib.pyplot as plt


def mergeByYear(year, income, countries):
    """
    Merge the countries and income data sets for a given year.

    ``income`` is indexed by year with one column per country; ``countries``
    has a 'Country' column. Returns ``countries`` inner-joined on 'Country'
    with an added 'Income' column holding that year's value (missing values
    are kept as NaN). Raises KeyError if ``year`` is not in ``income``.
    """
    yearly = income.loc[year]
    # Build the frame from the index and values directly so the result does
    # not depend on what the spreadsheet's header cell happens to be called.
    incomeSingle = pd.DataFrame(
        {'Country': yearly.index, 'Income': yearly.values},
        columns=['Country', 'Income'],
    )
    return pd.merge(countries, incomeSingle, on='Country')


def displaySummary(year, income):
    """
    Show a histogram of income per person across all countries for ``year``.

    Countries with no value for that year are left out of the histogram.
    """
    yearly = income.loc[year].dropna()
    plt.hist(yearly)
    plt.title("distribution of income per person across all countries in the world in " + str(year))
    plt.xlabel("income per person")
    plt.ylabel("number of countries")
    plt.show()


# ===========================================================================
# test.py
# ===========================================================================
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 5 13:21:15 2016

@author: kevinyan
"""

import unittest
import pandas as pd
from summaryInfo import mergeByYear


class Test(unittest.TestCase):
    """Checks mergeByYear against the real data files in the working directory."""

    @classmethod
    def setUpClass(cls):
        # Load the data once per test class instead of at import time, so
        # importing this module has no side effects.
        cls.countries = pd.read_csv('countries.csv')
        cls.income = pd.read_excel(
            'indicator gapminder gdp_per_capita_ppp.xlsx', index_col=0
        ).transpose()

    def testMerged(self):
        mergedTest = mergeByYear(2007, self.income, self.countries)
        # check if the dataFrame is correctly constructed
        self.assertEqual(set(mergedTest.columns), {'Country', 'Region', 'Income'})
        # assertEqual, not assertTrue: assertTrue would treat the expected
        # value as the failure message and pass for any non-empty string.
        # NOTE(review): expected values are taken from the original
        # assertions - confirm them against the data files.
        self.assertEqual(mergedTest['Country'][4], 'Burundi')
        self.assertEqual(mergedTest['Region'][5], 'AFRICA')


if __name__ == "__main__":
    unittest.main()