-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataexplorer.py
More file actions
135 lines (109 loc) · 3.78 KB
/
dataexplorer.py
File metadata and controls
135 lines (109 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import xml.etree.ElementTree as ET
# from pymongo import MongoClient
import psycopg2
import re
import pandas as pd
import json
# sample_file = ('startups.stackexchange.com/Tags.xml')
def read_xml(file):
    """
    Parse an XML file and return the root element of its tree.

    INPUTS:
    -------
    file: string
        path to the XML file on disk
    OUTPUTS:
    --------
    root: xml.etree.ElementTree.Element
        root element of the parsed document
    """
    tree = ET.parse(file)
    return tree.getroot()
def tag_cleaner(string):
    """
    Strip the '<' / '>' delimiters from a packed tag string and return
    the individual tags as a list, e.g. '<tag1><tag2>' -> ['tag1', 'tag2'].

    A falsy input (empty string or None) yields [None] so callers still
    produce exactly one row for the post.
    """
    if not string:
        return [None]
    # Dropping every '<' leaves 'tag1>tag2>'; splitting on '>' then gives a
    # trailing empty piece, which the [:-1] slice discards.
    return string.replace('<', '').split('>')[:-1]
def get_schema(schema_file):
    """
    Read a table schema exported as CSV and return it as a Series
    mapping column_name -> data_type.

    The CSV is expected to contain 'column_name' and 'data_type'
    columns; a ' (PK)' suffix marking the primary-key column is
    stripped from the column names.

    INPUTS:
    -------
    schema_file: string
        path to the schema CSV
    OUTPUTS:
    --------
    schema: pandas.Series
        index = cleaned column names, values = SQL data-type strings
    """
    raw = pd.read_csv(schema_file)
    # Strip the ' (PK)' marker wherever it appears rather than chopping the
    # last 5 characters of row 0 via a chained-assignment write (the original
    # `schema['column_name'].iloc[0] = ...` raises SettingWithCopyWarning and
    # breaks under pandas copy-on-write). regex=False keeps the match literal.
    names = raw['column_name'].str.replace(' (PK)', '', regex=False)
    return pd.Series(data=raw['data_type'].values, index=names)
def make_post_dataframe(data, schema, col_to_clean = None):
    """
    Build a DataFrame from XML row elements using `schema` for the columns.

    Posts with more than one value packed into the `col_to_clean`
    attribute (e.g. '<tag1><tag2>' in 'Tags') are expanded into one row
    per value; every other attribute is repeated on each expanded row.

    INPUTS:
    -------
    data: iterable of xml.etree.ElementTree.Element
        row elements whose attributes hold the column values
    schema: pandas.Series
        index = column names expected in the output
    col_to_clean: string or None
        name of the packed-tag attribute to expand; None disables expansion
    OUTPUTS:
    --------
    master_data: pandas.DataFrame
        columns follow schema.index; missing attributes come through as None
    """
    if col_to_clean:
        data_columns = schema.index.drop(col_to_clean)
    else:
        data_columns = schema.index
    all_data = []
    for element in data:
        # One template dict of the plain attributes per post.
        base_row = {column: element.attrib.get(column) for column in data_columns}
        if col_to_clean:
            # tag_cleaner returns [None] for a missing/empty attribute, so a
            # post with no tags still contributes exactly one row.
            for tag in tag_cleaner(element.attrib.get(col_to_clean)):
                row = dict(base_row)
                row[col_to_clean] = tag
                all_data.append(row)
        else:
            # Fix: the original still did attrib.get(None) here and merged a
            # spurious {None: None} key into every row; just emit the row.
            all_data.append(base_row)
    return pd.DataFrame(data=all_data, columns=schema.index)
def infer_schema(dataframe, schema, mapping_file='schema.json'):
    """
    Cast every column of `dataframe` to the dtype that its SQL type in
    `schema` maps to.

    INPUTS:
    -------
    dataframe: pandas.DataFrame
        data whose columns should be converted
    schema: pandas.Series
        column name -> SQL data-type string
    mapping_file: string, optional
        path to a JSON file mapping SQL type names to pandas dtype names.
        Defaults to 'schema.json', matching the original hard-coded path.
    OUTPUTS:
    --------
    pandas.DataFrame
        copy of `dataframe` with converted dtypes
    """
    # `with` closes the handle; the original json.load(open(...)) leaked it.
    with open(mapping_file) as fh:
        sql_to_python = json.load(fh)
    column_to_dtype = {column: sql_to_python[schema[column]]
                       for column in dataframe.columns}
    return dataframe.astype(column_to_dtype)
def make_postgres_table(data, keys, table_name, database_name):
    """
    Insert rows from XML row elements into an existing Postgres table.

    Connects to `database_name` on localhost as user 'postgres' and
    inserts one row per child element, taking the values for `keys`
    from each child's attributes. Only the first five children are
    inserted, preserving the original's sampling behavior.

    INPUTS:
    -------
    data: sequence of xml.etree.ElementTree.Element
    keys: sequence of string
        attribute names, in the column order (Id, TagName, Count,
        ExcerptPostId, WikiPostId)
    table_name: string
        target table; must come from trusted code — identifiers cannot
        be bound as query parameters
    database_name: string
    """
    pconn = psycopg2.connect(dbname=database_name,
                             user='postgres', host='localhost')
    try:
        # Original called pconn.cursor.execute('') — .execute on the bound
        # method object, an AttributeError — before ever creating a cursor.
        pcur = pconn.cursor()
        # Parameterized placeholders: psycopg2 escapes the values, fixing the
        # SQL injection risk of str.format. The original also formatted the
        # column NAMES (*keys) into VALUES once, inserting the same wrong row
        # for every child; now each child's attribute values are bound.
        sql_statement = (
            'INSERT INTO {} (Id, TagName, Count, ExcerptPostId, WikiPostId) '
            'VALUES (%s, %s, %s, %s, %s)'
        ).format(table_name)
        for child in data[0:5]:
            print('Inserting \n {} \n'.format(child))
            values = tuple(child.attrib.get(key) for key in keys)
            pcur.execute(sql_statement, values)
        pconn.commit()
    finally:
        # Always release the connection, even if an insert fails.
        pconn.close()
if __name__ == "__main__":
    # Demo run: parse the startups Stack Exchange Posts dump, load the table
    # schema exported as CSV, expand multi-tag posts into one row per tag,
    # then coerce the DataFrame dtypes to match the schema mapping.
    posts_path = ('startups.stackexchange.com/Posts.xml')
    posts_schema = get_schema("QueryResults.csv")
    posts_root = read_xml(posts_path)
    posts_df = make_post_dataframe(posts_root, posts_schema, col_to_clean='Tags')
    posts_df = infer_schema(posts_df, posts_schema)
    print(posts_df.dtypes)
    #
    # make_postgres_table(data, column_names, 'Tags', 'startups.stackexchange')