-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataexplorer.py
More file actions
135 lines (109 loc) · 3.78 KB
/
dataexplorer.py
File metadata and controls
135 lines (109 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import xml.etree.ElementTree as ET
# from pymongo import MongoClient
import psycopg2
import re
import pandas as pd
import json
# sample_file = ('startups.stackexchange.com/Tags.xml')
def read_xml(file):
    """
    Parse an XML file and return the root element of its tree.

    INPUTS:
    -------
    file: string
        path to the XML file on disk
    OUTPUTS:
    --------
    root: xml.etree.ElementTree.Element
        root element of the parsed document
    """
    tree = ET.parse(file)
    return tree.getroot()
def tag_cleaner(string):
    """
    Strip the '<' / '>' delimiters from a packed tag string and return
    the individual tags as a list, e.g. '<tag1><tag2>' -> ['tag1', 'tag2'].

    A falsy input (empty string or None) yields [None] so callers still
    produce exactly one row for the post.
    """
    if not string:
        return [None]
    # Dropping every '<' leaves 'tag1>tag2>'; splitting on '>' then gives a
    # trailing empty piece, which the [:-1] slice discards.
    return string.replace('<', '').split('>')[:-1]
def get_schema(schema_file):
    """
    Read a table schema exported as CSV and return it as a Series
    mapping column_name -> data_type.

    The CSV is expected to contain 'column_name' and 'data_type'
    columns; a ' (PK)' suffix marking the primary-key column is
    stripped from the column names.

    INPUTS:
    -------
    schema_file: string
        path to the schema CSV
    OUTPUTS:
    --------
    schema: pandas.Series
        index = cleaned column names, values = SQL data-type strings
    """
    raw = pd.read_csv(schema_file)
    # Strip the ' (PK)' marker wherever it appears rather than chopping the
    # last 5 characters of row 0 via a chained-assignment write (the original
    # `schema['column_name'].iloc[0] = ...` raises SettingWithCopyWarning and
    # breaks under pandas copy-on-write). regex=False keeps the match literal.
    names = raw['column_name'].str.replace(' (PK)', '', regex=False)
    return pd.Series(data=raw['data_type'].values, index=names)
def make_post_dataframe(data, schema, col_to_clean = None):
    """
    Build a DataFrame from XML row elements using `schema` for the columns.

    Posts with more than one value packed into the `col_to_clean`
    attribute (e.g. '<tag1><tag2>' in 'Tags') are expanded into one row
    per value; every other attribute is repeated on each expanded row.

    INPUTS:
    -------
    data: iterable of xml.etree.ElementTree.Element
        row elements whose attributes hold the column values
    schema: pandas.Series
        index = column names expected in the output
    col_to_clean: string or None
        name of the packed-tag attribute to expand; None disables expansion
    OUTPUTS:
    --------
    master_data: pandas.DataFrame
        columns follow schema.index; missing attributes come through as None
    """
    if col_to_clean:
        data_columns = schema.index.drop(col_to_clean)
    else:
        data_columns = schema.index
    all_data = []
    for element in data:
        # One template dict of the plain attributes per post.
        base_row = {column: element.attrib.get(column) for column in data_columns}
        if col_to_clean:
            # tag_cleaner returns [None] for a missing/empty attribute, so a
            # post with no tags still contributes exactly one row.
            for tag in tag_cleaner(element.attrib.get(col_to_clean)):
                row = dict(base_row)
                row[col_to_clean] = tag
                all_data.append(row)
        else:
            # Fix: the original still did attrib.get(None) here and merged a
            # spurious {None: None} key into every row; just emit the row.
            all_data.append(base_row)
    return pd.DataFrame(data=all_data, columns=schema.index)
def infer_schema(dataframe, schema, mapping_file='schema.json'):
    """
    Cast every column of `dataframe` to the dtype that its SQL type in
    `schema` maps to.

    INPUTS:
    -------
    dataframe: pandas.DataFrame
        data whose columns should be converted
    schema: pandas.Series
        column name -> SQL data-type string
    mapping_file: string, optional
        path to a JSON file mapping SQL type names to pandas dtype names.
        Defaults to 'schema.json', matching the original hard-coded path.
    OUTPUTS:
    --------
    pandas.DataFrame
        copy of `dataframe` with converted dtypes
    """
    # `with` closes the handle; the original json.load(open(...)) leaked it.
    with open(mapping_file) as fh:
        sql_to_python = json.load(fh)
    column_to_dtype = {column: sql_to_python[schema[column]]
                       for column in dataframe.columns}
    return dataframe.astype(column_to_dtype)
def make_postgres_table(data, keys, table_name, database_name):
    """
    Insert rows from XML row elements into an existing Postgres table.

    Connects to `database_name` on localhost as user 'postgres' and
    inserts one row per child element, taking the values for `keys`
    from each child's attributes. Only the first five children are
    inserted, preserving the original's sampling behavior.

    INPUTS:
    -------
    data: sequence of xml.etree.ElementTree.Element
    keys: sequence of string
        attribute names, in the column order (Id, TagName, Count,
        ExcerptPostId, WikiPostId)
    table_name: string
        target table; must come from trusted code — identifiers cannot
        be bound as query parameters
    database_name: string
    """
    pconn = psycopg2.connect(dbname=database_name,
                             user='postgres', host='localhost')
    try:
        # Original called pconn.cursor.execute('') — .execute on the bound
        # method object, an AttributeError — before ever creating a cursor.
        pcur = pconn.cursor()
        # Parameterized placeholders: psycopg2 escapes the values, fixing the
        # SQL injection risk of str.format. The original also formatted the
        # column NAMES (*keys) into VALUES once, inserting the same wrong row
        # for every child; now each child's attribute values are bound.
        sql_statement = (
            'INSERT INTO {} (Id, TagName, Count, ExcerptPostId, WikiPostId) '
            'VALUES (%s, %s, %s, %s, %s)'
        ).format(table_name)
        for child in data[0:5]:
            print('Inserting \n {} \n'.format(child))
            values = tuple(child.attrib.get(key) for key in keys)
            pcur.execute(sql_statement, values)
        pconn.commit()
    finally:
        # Always release the connection, even if an insert fails.
        pconn.close()
if __name__ == "__main__":
    # Demo run: parse the startups Stack Exchange Posts dump, load the table
    # schema exported as CSV, expand multi-tag posts into one row per tag,
    # then coerce the DataFrame dtypes to match the schema mapping.
    posts_path = ('startups.stackexchange.com/Posts.xml')
    posts_schema = get_schema("QueryResults.csv")
    posts_root = read_xml(posts_path)
    posts_df = make_post_dataframe(posts_root, posts_schema, col_to_clean='Tags')
    posts_df = infer_schema(posts_df, posts_schema)
    print(posts_df.dtypes)
    #
    # make_postgres_table(data, column_names, 'Tags', 'startups.stackexchange')