Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 42 additions & 3 deletions data/ares/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from urllib.request import urlopen, urlretrieve

import lxml.etree
import re

BASE_URL = "https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz"

Expand All @@ -21,15 +22,40 @@ def attr(root, parts, nsmap):

return ret

def attri(root, parts, nsmap, fnc):
    """Merge the JSON objects produced for selected children of *root*.

    For every name in *parts*, the matching ``are:<name>`` children of
    *root* are looked up and handed to *fnc*, which must return an iterable
    of JSON-encoded objects with string values. Values that share a key are
    concatenated in encounter order, each one followed by ``";"``.

    Args:
        root: Element whose direct children are searched.
        parts: Local tag names to look up under the ``are`` prefix.
        nsmap: Prefix-to-namespace mapping passed to ``findall``.
        fnc: Callable turning a list of elements into JSON strings.

    Returns:
        A one-element list holding the merged object as a JSON string
        (``'{}'`` when nothing matched).
    """
    collected = {}
    for part in parts:
        matches = root.findall("./are:%s" % part, namespaces=nsmap)
        if not matches:
            continue
        for encoded in fnc(matches):
            for key, value in json.loads(encoded).items():
                collected.setdefault(key, []).append(value)

    # Every value keeps its trailing ";" so the output format is unchanged;
    # the pieces are joined once instead of growing a string per iteration.
    merged = {
        key: "".join(value + ";" for value in values)
        for key, values in collected.items()
    }
    return [json.dumps(merged, ensure_ascii=False)]

# Newlines, tabs and non-breaking spaces become plain spaces; double quotes
# are dropped. One translate pass replaces the chained ``replace`` calls.
_TEXT_CLEANUP = str.maketrans({"\n": " ", "\t": " ", '"': None, "\xa0": " "})


def _clean_text(text):
    """Return *text* stripped and normalised; ``None`` becomes ``""``."""
    if text is None:
        # Empty elements have no text; calling ``strip`` on None would raise.
        return ""
    return text.strip().translate(_TEXT_CLEANUP)


def _local_name(tag):
    """Return *tag* without its leading ``{namespace}`` part, if any."""
    return tag.rsplit("}", 1)[-1]


def obj(root, multiple_same_tag=False):
    """Serialise the direct children of *root* into a JSON object string.

    Args:
        root: Element whose children are serialised, or ``None``.
        multiple_same_tag: When false, each child's local tag name maps to
            its cleaned text and a repeated tag keeps the last value. When
            true, keys are the parent's local name followed by the child's
            local name, and the texts of repeated tags are joined with
            ``";"`` in document order.

    Returns:
        The JSON string (non-ASCII characters kept as-is), or ``None`` when
        *root* is ``None``.
    """
    if root is None:
        return None

    # Iterating the element directly replaces the deprecated getchildren().
    if multiple_same_tag:
        parent = _local_name(root.tag)
        grouped = {}
        for child in root:
            key = parent + _local_name(child.tag)
            grouped.setdefault(key, []).append(_clean_text(child.text))
        els = {key: ";".join(values) for key, values in grouped.items()}
    else:
        els = {_local_name(child.tag): _clean_text(child.text) for child in root}

    return json.dumps(els, ensure_ascii=False)

def list_obj(el):
    """Return ``obj(item, True)`` for every element in *el*, in order."""
    return [obj(item, True) for item in el]

def organi(root, ico, nsmap):
nazev = root.find("./are:Nazev", namespaces=nsmap).text
Expand Down Expand Up @@ -104,6 +130,7 @@ def main(outdir: str, partial: bool = False):
"datum_zapisu",
"datum_vymazu",
"sidlo",
"cinnosti",
]
udc.writerow(cols)
foc.writerow(
Expand Down Expand Up @@ -190,6 +217,18 @@ def main(outdir: str, partial: bool = False):
dt.extend(zi)
dt.append(obj(zakl.find("./are:Sidlo", namespaces=et.nsmap)))

# zaznamy o predmetu cinnosti
cinn = zakl.find("./are:Cinnosti", namespaces=et.nsmap)
cinn_cols = [
"PredmetPodnikani",
"Ucel",
"DoplnkovaCinnost",
"PredmetCinnosti",
]

pr = attri(cinn, cinn_cols, et.nsmap, list_obj) if cinn is not None else ['{}']
dt.extend(pr)

# zapis dat do master tabulky
udc.writerow(dt)

Expand Down
1 change: 1 addition & 0 deletions data/ares/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Column("datum_zapisu", Date, nullable=False),
Column("datum_vymazu", Date, nullable=True),
Column("sidlo", JSON, nullable=True),
Column("cinnosti", JSON, nullable=True),
),
Table(
"fosoby",
Expand Down
9 changes: 9 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import csv
from functools import partial
import os
import shutil
from collections import defaultdict
from importlib import import_module
from datetime import datetime

from sqlalchemy import Boolean, create_engine

Expand Down Expand Up @@ -33,8 +35,10 @@
action="store_true",
help="procesuj jen cast vstupnich dat - vhodne pro testovani, CI apod.",
)
parser.add_argument("--timesubf", action="store_true", help="Ve vystupnim adresari vytvori podadresar s casovy razitkem do ktereho se teprve budou ukladat zpracovana data z jednoltivych modulu.")
parser.add_argument("--all", action="store_true", help="procesuj vsechny moduly")
parser.add_argument("modules", nargs="*", help="specify which datasets to include")

args = parser.parse_args()

if args.all and len(args.modules) > 0:
Expand All @@ -48,6 +52,11 @@
)

base_outdir = "csv"

if args.timesubf:
prefix_d = "full_" if not(args.partial) else "partial_"
base_outdir = os.path.join(base_outdir, prefix_d + datetime.now().strftime("%Y%m%d%H%M%S"))

os.makedirs(base_outdir, exist_ok=True)

engine = None
Expand Down