From a5fa55cbb36814b6ad2a8dd94c58462ff850e967 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sat, 29 Oct 2016 12:25:22 -0600 Subject: [PATCH 01/24] consistently use print as a function --- changesetmd.py | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/changesetmd.py b/changesetmd.py index 18d2e25..e9e4f3b 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -33,7 +33,7 @@ def __init__(self, createGeometry): self.createGeometry = createGeometry def truncateTables(self, connection): - print 'truncating tables' + print('truncating tables') cursor = connection.cursor() cursor.execute("TRUNCATE TABLE osm_changeset_comment CASCADE;") cursor.execute("TRUNCATE TABLE osm_changeset CASCADE;") @@ -42,7 +42,7 @@ def truncateTables(self, connection): connection.commit() def createTables(self, connection): - print 'creating tables' + print('creating tables') cursor = connection.cursor() cursor.execute(queries.createChangesetTable) cursor.execute(queries.initStateTable) @@ -113,23 +113,23 @@ def parseFile(self, connection, changesetFile, doReplication): elem.attrib.get('user', None), tags, comments) if((parsedCount % 10000) == 0): - print "parsed %s" % ('{:,}'.format(parsedCount)) - print "cumulative rate: %s/sec" % '{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)) + print("parsed %s" % ('{:,}'.format(parsedCount))) + print("cumulative rate: %s/sec" % '{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime))) #clear everything we don't need from memory to avoid leaking elem.clear() while elem.getprevious() is not None: del elem.getparent()[0] connection.commit() - print "parsing complete" - print "parsed {:,}".format(parsedCount) + print("parsing complete") + print("parsed {:,}".format(parsedCount)) def fetchReplicationFile(self, sequenceNumber): topdir = format(sequenceNumber / 1000000, '003') subdir = format((sequenceNumber / 1000) % 1000, '003') 
fileNumber = format(sequenceNumber % 1000, '003') fileUrl = BASE_REPL_URL + topdir + '/' + subdir + '/' + fileNumber + '.osm.gz' - print "opening replication file at " + fileUrl + print("opening replication file at " + fileUrl) replicationFile = urllib2.urlopen(fileUrl) replicationData = StringIO(replicationFile.read()) return gzip.GzipFile(fileobj=replicationData) @@ -139,7 +139,7 @@ def doReplication(self, connection): try: cursor.execute('LOCK TABLE osm_changeset_state IN ACCESS EXCLUSIVE MODE NOWAIT') except psycopg2.OperationalError as e: - print "error getting lock on state table. Another process might be running" + print("error getting lock on state table. Another process might be running") return 1 cursor.execute('select * from osm_changeset_state') dbStatus = cursor.fetchone() @@ -149,12 +149,12 @@ def doReplication(self, connection): newTimestamp = None if(dbStatus['last_timestamp'] is not None): timestamp = dbStatus['last_timestamp'] - print "latest timestamp in database: " + str(timestamp) + print("latest timestamp in database: " + str(timestamp)) if(dbStatus['update_in_progress'] == 1): - print "concurrent update in progress. Bailing out!" + print("concurrent update in progress. Bailing out!") return 1 if(lastDbSequence == -1): - print "replication state not initialized. You must set the sequence number first." + print("replication state not initialized. 
You must set the sequence number first.") return 1 cursor.execute('update osm_changeset_state set update_in_progress = 1') connection.commit() @@ -166,12 +166,12 @@ def doReplication(self, connection): try: serverState = yaml.load(urllib2.urlopen(BASE_REPL_URL + "state.yaml")) lastServerSequence = serverState['sequence'] - print "got sequence" + print("got sequence") lastServerTimestamp = serverState['last_run'] - print "last timestamp on server: " + str(lastServerTimestamp) + print("last timestamp on server: " + str(lastServerTimestamp)) except Exception as e: - print "error retrieving server state file. Bailing on replication" - print e + print("error retrieving server state file. Bailing on replication") + print(e) returnStatus = 2 else: try: @@ -187,8 +187,8 @@ def doReplication(self, connection): timestamp = lastServerTimestamp print("finished with replication. Clearing status record") except Exception as e: - print "error during replication" - print e + print("error during replication") + print(e) returnStatus = 2 cursor.execute('update osm_changeset_state set update_in_progress = 0, last_timestamp = %s', (timestamp,)) connection.commit() @@ -231,9 +231,9 @@ def doReplication(self, connection): if not (args.fileName is None): if args.createGeometry: - print 'parsing changeset file with geometries' + print('parsing changeset file with geometries') else: - print 'parsing changeset file' + print('parsing changeset file') changesetFile = None if(args.doReplication): changesetFile = gzip.open(args.fileName, 'rb') @@ -242,7 +242,7 @@ def doReplication(self, connection): if(bz2Support): changesetFile = BZ2File(args.fileName) else: - print 'ERROR: bzip2 support not available. Unzip file first or install bz2file' + print('ERROR: bzip2 support not available. 
Unzip file first or install bz2file') sys.exit(1) else: changesetFile = open(args.fileName, 'rb') @@ -250,14 +250,14 @@ def doReplication(self, connection): if(changesetFile != None): md.parseFile(conn, changesetFile, args.doReplication) else: - print 'ERROR: no changeset file opened. Something went wrong in processing args' + print('ERROR: no changeset file opened. Something went wrong in processing args') sys.exist(1) if(not args.doReplication): cursor = conn.cursor() - print 'creating constraints' + print('creating constraints') cursor.execute(queries.createConstraints) - print 'creating indexes' + print('creating indexes') cursor.execute(queries.createIndexes) if args.createGeometry: cursor.execute(queries.createGeomIndex) @@ -268,6 +268,6 @@ def doReplication(self, connection): endTime = datetime.now() timeCost = endTime - beginTime - print 'Processing time cost is ', timeCost + print('Processing time cost is ', timeCost) - print 'All done. Enjoy your (meta)data!' + print('All done. 
Enjoy your (meta)data!') From 8c85a063961c00c5488ebd94e3026f6555cb8b65 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sat, 29 Oct 2016 12:28:59 -0600 Subject: [PATCH 02/24] switching to requests in favor of urllib --- changesetmd.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/changesetmd.py b/changesetmd.py index e9e4f3b..e0ebf3c 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -13,12 +13,11 @@ import psycopg2.extras import queries import gzip -import urllib2 +import requests import yaml from lxml import etree from datetime import datetime from datetime import timedelta -from StringIO import StringIO try: from bz2file import BZ2File @@ -130,8 +129,8 @@ def fetchReplicationFile(self, sequenceNumber): fileNumber = format(sequenceNumber % 1000, '003') fileUrl = BASE_REPL_URL + topdir + '/' + subdir + '/' + fileNumber + '.osm.gz' print("opening replication file at " + fileUrl) - replicationFile = urllib2.urlopen(fileUrl) - replicationData = StringIO(replicationFile.read()) + replicationFile = requests.get(fileUrl) + replicationData = replicationFile.text return gzip.GzipFile(fileobj=replicationData) def doReplication(self, connection): @@ -164,7 +163,7 @@ def doReplication(self, connection): #at the end of this method to unlock the database or an error will forever leave it locked returnStatus = 0 try: - serverState = yaml.load(urllib2.urlopen(BASE_REPL_URL + "state.yaml")) + serverState = yaml.load(requests.get(BASE_REPL_URL + "state.yaml").text) lastServerSequence = serverState['sequence'] print("got sequence") lastServerTimestamp = serverState['last_run'] From 36aa720acaa2feaa029ead860afd59c388ac0bf8 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sat, 29 Oct 2016 12:30:06 -0600 Subject: [PATCH 03/24] adding requirements.txt --- README.md | 4 +++- requirements.txt | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index 4b7da69..12e2e01 100644 
--- a/README.md +++ b/README.md @@ -8,11 +8,13 @@ It can also keep a database created with a weekly dump file up to date using min Setup ------------ -ChangesetMD works with python 2.7. +ChangesetMD works with python 2.7 and Python 3.5. Aside from postgresql, ChangesetMD depends on the python libraries psycopg2 and lxml. On Debian-based systems this means installing the python-psycopg2 and python-lxml packages. +If you are using `pip` and `virtualenv`, you can install all dependencies with `pip install -r requirements.txt`. + If you want to parse the changeset file without first unzipping it, you will also need to install the [bz2file library](http://pypi.python.org/pypi/bz2file) since the built in bz2 library can not handle multi-stream bzip files. For building geometries, ```postgis``` extension needs to be [installed](http://postgis.net/install). diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0491196 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +lxml==3.6.4 +psycopg2==2.6.2 +PyYAML==3.12 +requests==2.11.1 From a347a89c18797a42b1825ec3d87b750c4bb238dd Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sat, 29 Oct 2016 13:22:59 -0600 Subject: [PATCH 04/24] finishing up py3 compatibility --- changesetmd.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/changesetmd.py b/changesetmd.py index e0ebf3c..6b33e21 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python ''' ChangesetMD is a simple XML parser to read the weekly changeset metadata dumps from OpenStreetmap into a postgres database for querying. 
@@ -79,7 +79,10 @@ def parseFile(self, connection, changesetFile, doReplication): startTime = datetime.now() cursor = connection.cursor() context = etree.iterparse(changesetFile) - action, root = context.next() + if sys.version_info[0] < 3: + action, root = context.next() + else: + action, root = next(context) for action, elem in context: if(elem.tag != 'changeset'): continue From 087206104deacd1c798cdc56aff4cb488a2ea3c3 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sat, 29 Oct 2016 13:23:50 -0600 Subject: [PATCH 05/24] add bz2file to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 0491196..c6e8b51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ lxml==3.6.4 psycopg2==2.6.2 PyYAML==3.12 requests==2.11.1 +bz2file==0.98 \ No newline at end of file From 204c710aef7d46a0bafaf846411db8ad70ca424f Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sat, 29 Oct 2016 15:43:06 -0600 Subject: [PATCH 06/24] added print function from __future__ for Python 2 backwards compatibility --- changesetmd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/changesetmd.py b/changesetmd.py index 6b33e21..aef8c97 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -6,6 +6,7 @@ @author: Toby Murray ''' +from __future__ import print_function import os import sys import argparse From 3b2ff2a0ee6bd0508c1ae0ef26f3cad7c300efad Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Tue, 10 Jul 2018 15:59:30 -0600 Subject: [PATCH 07/24] updating requirements --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c6e8b51..f70a796 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -lxml==3.6.4 -psycopg2==2.6.2 +lxml==4.2.3 +psycopg2-binary==2.7.5 PyYAML==3.12 requests==2.11.1 bz2file==0.98 \ No newline at end of file From 5a85bf153104cdb0cfa8cdcbf16d0eb671351872 Mon Sep 17 00:00:00 2001 From: 
Martijn van Exel Date: Thu, 7 Mar 2019 08:51:58 -0700 Subject: [PATCH 08/24] py3 optimizations --- .gitignore | 3 ++- changesetmd.py | 23 +++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 7e99e36..0205d62 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*.pyc \ No newline at end of file +*.pyc +.DS_Store diff --git a/changesetmd.py b/changesetmd.py index aef8c97..b840460 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -7,7 +7,6 @@ ''' from __future__ import print_function -import os import sys import argparse import psycopg2 @@ -78,13 +77,7 @@ def deleteExisting(self, connection, id): def parseFile(self, connection, changesetFile, doReplication): parsedCount = 0 startTime = datetime.now() - cursor = connection.cursor() - context = etree.iterparse(changesetFile) - if sys.version_info[0] < 3: - action, root = context.next() - else: - action, root = next(context) - for action, elem in context: + for event, elem in etree.iterparse(changesetFile): if(elem.tag != 'changeset'): continue @@ -128,14 +121,16 @@ def parseFile(self, connection, changesetFile, doReplication): print("parsed {:,}".format(parsedCount)) def fetchReplicationFile(self, sequenceNumber): - topdir = format(sequenceNumber / 1000000, '003') - subdir = format((sequenceNumber / 1000) % 1000, '003') - fileNumber = format(sequenceNumber % 1000, '003') + sequenceNumber = str(sequenceNumber).zfill(9) + topdir = str(sequenceNumber)[:3] + subdir = str(sequenceNumber)[3:6] + fileNumber = str(sequenceNumber)[-3:] fileUrl = BASE_REPL_URL + topdir + '/' + subdir + '/' + fileNumber + '.osm.gz' print("opening replication file at " + fileUrl) - replicationFile = requests.get(fileUrl) - replicationData = replicationFile.text - return gzip.GzipFile(fileobj=replicationData) + replicationFile = requests.get(fileUrl, stream=True) + replicationData = replicationFile.raw + f = gzip.GzipFile(fileobj=replicationData) + return f def doReplication(self, 
connection): cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor) From 147ed8b07098328df7f463c8e275717249c658e7 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Fri, 15 Mar 2019 10:42:39 -0600 Subject: [PATCH 09/24] http --> https --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index b840460..efb6433 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -25,7 +25,7 @@ except ImportError: bz2Support = False -BASE_REPL_URL = "http://planet.openstreetmap.org/replication/changesets/" +BASE_REPL_URL = "https://planet.openstreetmap.org/replication/changesets/" class ChangesetMD(): def __init__(self, createGeometry): From 29364a512a005602d9639b4222cb053169710f61 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Fri, 15 Mar 2019 10:42:39 -0600 Subject: [PATCH 10/24] http --> https --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index b840460..efb6433 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -25,7 +25,7 @@ except ImportError: bz2Support = False -BASE_REPL_URL = "http://planet.openstreetmap.org/replication/changesets/" +BASE_REPL_URL = "https://planet.openstreetmap.org/replication/changesets/" class ChangesetMD(): def __init__(self, createGeometry): From b7e7ea81d18d125faca609aa04064297014a6930 Mon Sep 17 00:00:00 2001 From: jlevente Date: Thu, 30 May 2019 10:37:27 -0400 Subject: [PATCH 11/24] speed up import with pyscopg2.extras.execute_batch --- changesetmd.py | 60 +++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/changesetmd.py b/changesetmd.py index 18d2e25..da74479 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -50,23 +50,28 @@ def createTables(self, connection): cursor.execute(queries.createGeometryColumn) connection.commit() - def insertNew(self, connection, id, userId, createdAt, minLat, maxLat, minLon, maxLon, closedAt, open, 
numChanges, userName, tags, comments): + def insertNewBatch(self, connection, data_arr): cursor = connection.cursor() if self.createGeometry: - cursor.execute('''INSERT into osm_changeset + sql = '''INSERT into osm_changeset (id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags, geom) - values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,ST_SetSRID(ST_MakeEnvelope(%s,%s,%s,%s), 4326))''', - (id, userId, createdAt, minLat, maxLat, minLon, maxLon, closedAt, open, numChanges, userName, tags, minLon, minLat, maxLon, maxLat)) + values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,ST_SetSRID(ST_MakeEnvelope(%s,%s,%s,%s), 4326))''' + pyscopg2.extras.execute_batch(cursor, sql, data_arr) + cursor.close() else: - cursor.execute('''INSERT into osm_changeset + sql = '''INSERT into osm_changeset (id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags) - values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', - (id, userId, createdAt, minLat, maxLat, minLon, maxLon, closedAt, open, numChanges, userName, tags)) - for comment in comments: - cursor.execute('''INSERT into osm_changeset_comment + values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' + psycopg2.extras.execute_batch(cursor, sql, data_arr) + cursor.close() + + def insertNewBatchComment(self, connection, comment_arr): + cursor=connection.cursor() + sql = '''INSERT into osm_changeset_comment (comment_changeset_id, comment_user_id, comment_user_name, comment_date, comment_text) - values (%s,%s,%s,%s,%s)''', - (id, comment['uid'], comment['user'], comment['date'], comment['text'])) + values (%s,%s,%s,%s,%s)''' + psycopg2.extras.execute_batch(cursor, sql, comment_arr) + cursor.close() def deleteExisting(self, connection, id): cursor = connection.cursor() @@ -81,6 +86,8 @@ def parseFile(self, connection, changesetFile, doReplication): cursor = connection.cursor() context = etree.iterparse(changesetFile) action, root = context.next() + changesets = [] 
+ comments = [] for action, elem in context: if(elem.tag != 'changeset'): continue @@ -91,28 +98,32 @@ def parseFile(self, connection, changesetFile, doReplication): for tag in elem.iterchildren(tag='tag'): tags[tag.attrib['k']] = tag.attrib['v'] - comments = [] for discussion in elem.iterchildren(tag='discussion'): for commentElement in discussion.iterchildren(tag='comment'): - comment = dict() - comment['uid'] = commentElement.attrib.get('uid') - comment['user'] = commentElement.attrib.get('user') - comment['date'] = commentElement.attrib.get('date') for text in commentElement.iterchildren(tag='text'): - comment['text'] = text.text + text = text.text + comment = (elem.attrib['id'], ommentElement.attrib.get('uid'), commentElement.attrib.get('user'), commentElement.attrib.get('date'), text) comments.append(comment) if(doReplication): self.deleteExisting(connection, elem.attrib['id']) - self.insertNew(connection, elem.attrib['id'], elem.attrib.get('uid', None), - elem.attrib['created_at'], elem.attrib.get('min_lat', None), - elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), - elem.attrib.get('max_lon', None),elem.attrib.get('closed_at', None), - elem.attrib.get('open', None), elem.attrib.get('num_changes', None), - elem.attrib.get('user', None), tags, comments) + if self.createGeometry: + id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags, geom + changesets.append((elem.attrib['id'], elem.attrib.get('uid', None), elem.attrib['created_at'], elem.attrib.get('min_lat', None), + elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), elem.attrib.get('max_lon', None), elem.attrib.get('closed_at', None), + elem.attrib.get('open', None), elem.attrib.get('num_changes', None), elem.attrib.get('user', None), tags,elem.attrib.get('min_lon', None), elem.attrib.get('min_lat', None), + elem.attrib.get('max_lon', None), elem.attrib.get('max_lat', None))) + else: + 
changesets.append((elem.attrib['id'], elem.attrib.get('uid', None), elem.attrib['created_at'], elem.attrib.get('min_lat', None), + elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), elem.attrib.get('max_lon', None), elem.attrib.get('closed_at', None), + elem.attrib.get('open', None), elem.attrib.get('num_changes', None), elem.attrib.get('user', None), tags)) if((parsedCount % 10000) == 0): + self.insertNewBatch(connection, changesets) + self.insertNewBatchComment(connection, comments ) + changesets = [] + comments = [] print "parsed %s" % ('{:,}'.format(parsedCount)) print "cumulative rate: %s/sec" % '{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)) @@ -120,6 +131,9 @@ def parseFile(self, connection, changesetFile, doReplication): elem.clear() while elem.getprevious() is not None: del elem.getparent()[0] + # Update whatever is left, then commit + self.isertNewBatch(connection, changesets) + self.insertNewBatchComment(connection, comments) connection.commit() print "parsing complete" print "parsed {:,}".format(parsedCount) From 864024a609fd00b328d35388246da0db5240429c Mon Sep 17 00:00:00 2001 From: jlevente Date: Thu, 30 May 2019 10:38:17 -0400 Subject: [PATCH 12/24] print status every 100k --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index da74479..0167a68 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -119,7 +119,7 @@ def parseFile(self, connection, changesetFile, doReplication): elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), elem.attrib.get('max_lon', None), elem.attrib.get('closed_at', None), elem.attrib.get('open', None), elem.attrib.get('num_changes', None), elem.attrib.get('user', None), tags)) - if((parsedCount % 10000) == 0): + if((parsedCount % 100000) == 0): self.insertNewBatch(connection, changesets) self.insertNewBatchComment(connection, comments ) changesets = [] From 72c1de194a1d69f719302e6f4b93973fed3bbe91 
Mon Sep 17 00:00:00 2001 From: Gitea Date: Wed, 31 Jul 2019 18:41:15 -0400 Subject: [PATCH 13/24] fix typo --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index 0167a68..2ac6d9b 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -102,7 +102,7 @@ def parseFile(self, connection, changesetFile, doReplication): for commentElement in discussion.iterchildren(tag='comment'): for text in commentElement.iterchildren(tag='text'): text = text.text - comment = (elem.attrib['id'], ommentElement.attrib.get('uid'), commentElement.attrib.get('user'), commentElement.attrib.get('date'), text) + comment = (elem.attrib['id'], commentElement.attrib.get('uid'), commentElement.attrib.get('user'), commentElement.attrib.get('date'), text) comments.append(comment) if(doReplication): From 42fb06d1fa5c72ec0a7794c339587682e945775b Mon Sep 17 00:00:00 2001 From: Gitea Date: Wed, 31 Jul 2019 18:51:29 -0400 Subject: [PATCH 14/24] delete extra line --- changesetmd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index 2ac6d9b..bf02e5d 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -109,7 +109,6 @@ def parseFile(self, connection, changesetFile, doReplication): self.deleteExisting(connection, elem.attrib['id']) if self.createGeometry: - id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags, geom changesets.append((elem.attrib['id'], elem.attrib.get('uid', None), elem.attrib['created_at'], elem.attrib.get('min_lat', None), elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), elem.attrib.get('max_lon', None), elem.attrib.get('closed_at', None), elem.attrib.get('open', None), elem.attrib.get('num_changes', None), elem.attrib.get('user', None), tags,elem.attrib.get('min_lon', None), elem.attrib.get('min_lat', None), From 3a25b95e0ce48654b014919c0bb4ef4d63dfb144 Mon Sep 17 00:00:00 2001 From: Gitea Date: Wed, 31 Jul 2019 19:01:53
-0400 Subject: [PATCH 15/24] fixed typo --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index bf02e5d..5e1f004 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -56,7 +56,7 @@ def insertNewBatch(self, connection, data_arr): sql = '''INSERT into osm_changeset (id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags, geom) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,ST_SetSRID(ST_MakeEnvelope(%s,%s,%s,%s), 4326))''' - pyscopg2.extras.execute_batch(cursor, sql, data_arr) + psycopg2.extras.execute_batch(cursor, sql, data_arr) cursor.close() else: sql = '''INSERT into osm_changeset From 38054a56b716be3648db3ca8aca8e644652893ae Mon Sep 17 00:00:00 2001 From: Gitea Date: Thu, 1 Aug 2019 15:26:42 -0400 Subject: [PATCH 16/24] fixed typo --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py b/changesetmd.py index 5e1f004..0b22453 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -131,7 +131,7 @@ def parseFile(self, connection, changesetFile, doReplication): while elem.getprevious() is not None: del elem.getparent()[0] # Update whatever is left, then commit - self.isertNewBatch(connection, changesets) + self.insertNewBatch(connection, changesets) self.insertNewBatchComment(connection, comments) connection.commit() print "parsing complete" print "parsed {:,}".format(parsedCount) From a81cbd917962b6836bfd008d3d2a904aa39520fd Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Tue, 3 Sep 2019 11:26:14 -0600 Subject: [PATCH 17/24] python 3 updates, organize imports --- changesetmd.py | 12 ++++++------ requirements.txt | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/changesetmd.py b/changesetmd.py index 305426b..e3c6bc4 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -9,15 +9,15 @@ from __future__ import print_function import sys import argparse +import gzip +from datetime import datetime +from datetime import timedelta import
psycopg2 import psycopg2.extras import queries -import gzip import requests import yaml from lxml import etree -from datetime import datetime -from datetime import timedelta try: from bz2file import BZ2File @@ -84,7 +84,7 @@ def parseFile(self, connection, changesetFile, doReplication): startTime = datetime.now() cursor = connection.cursor() context = etree.iterparse(changesetFile) - action, root = context.next() + action, root = next(context) changesets = [] comments = [] for action, elem in context: @@ -122,8 +122,8 @@ def parseFile(self, connection, changesetFile, doReplication): self.insertNewBatchComment(connection, comments ) changesets = [] comments = [] - print "parsed %s" % ('{:,}'.format(parsedCount)) - print "cumulative rate: %s/sec" % '{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)) + print("parsed {}".format(('{:,}'.format(parsedCount)))) + print("cumulative rate: {}/sec".format('{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)))) #clear everything we don't need from memory to avoid leaking elem.clear() diff --git a/requirements.txt b/requirements.txt index f70a796..1f780f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ lxml==4.2.3 psycopg2-binary==2.7.5 -PyYAML==3.12 +PyYAML==5.1.2 requests==2.11.1 bz2file==0.98 \ No newline at end of file From cac6cee96c1f6abcf91e3064b737e854f70f5bf1 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Tue, 3 Sep 2019 11:26:14 -0600 Subject: [PATCH 18/24] python 3 updates, organize imports, merging in improvements from jlevente --- changesetmd.py | 12 ++++++------ requirements.txt | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/changesetmd.py b/changesetmd.py index 305426b..e3c6bc4 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -9,15 +9,15 @@ from __future__ import print_function import sys import argparse +import gzip +from datetime import datetime +from datetime import timedelta import psycopg2 import 
psycopg2.extras import queries -import gzip import requests import yaml from lxml import etree -from datetime import datetime -from datetime import timedelta try: from bz2file import BZ2File @@ -84,7 +84,7 @@ def parseFile(self, connection, changesetFile, doReplication): startTime = datetime.now() cursor = connection.cursor() context = etree.iterparse(changesetFile) - action, root = context.next() + action, root = next(context) changesets = [] comments = [] for action, elem in context: @@ -122,8 +122,8 @@ def parseFile(self, connection, changesetFile, doReplication): self.insertNewBatchComment(connection, comments ) changesets = [] comments = [] - print "parsed %s" % ('{:,}'.format(parsedCount)) - print "cumulative rate: %s/sec" % '{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)) + print("parsed {}".format(('{:,}'.format(parsedCount)))) + print("cumulative rate: {}/sec".format('{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)))) #clear everything we don't need from memory to avoid leaking elem.clear() diff --git a/requirements.txt b/requirements.txt index f70a796..1f780f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ lxml==4.2.3 psycopg2-binary==2.7.5 -PyYAML==3.12 +PyYAML==5.1.2 requests==2.11.1 bz2file==0.98 \ No newline at end of file From 79666d0cf6dcb349ea0b218e7ddd8429eff11706 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Sun, 11 Sep 2022 09:04:38 -0600 Subject: [PATCH 19/24] formatting, deprecating Python 2 --- .gitignore | 2 + README.md | 2 +- changesetmd.py | 323 +++++++++++++++++++++++++++++++++-------------- requirements.txt | 8 +- 4 files changed, 238 insertions(+), 97 deletions(-) diff --git a/.gitignore b/.gitignore index 0205d62..2a7fe04 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.pyc .DS_Store +venv/ +__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md index 1b05cd7..c253d6b 100644 --- a/README.md +++ b/README.md @@ 
-8,7 +8,7 @@ It can also keep a database created with a weekly dump file up to date using min Setup ------------ -ChangesetMD works with python 2.7 and Python 3.5. +ChangesetMD works with Python 3.6 or newer. Aside from postgresql, ChangesetMD depends on the python libraries psycopg2 and lxml. On Debian-based systems this means installing the python-psycopg2 and python-lxml packages. diff --git a/changesetmd.py b/changesetmd.py index e3c6bc4..dd62587 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -1,12 +1,11 @@ #!/usr/bin/env python -''' +""" ChangesetMD is a simple XML parser to read the weekly changeset metadata dumps from OpenStreetmap into a postgres database for querying. @author: Toby Murray -''' +""" -from __future__ import print_function import sys import argparse import gzip @@ -21,27 +20,31 @@ try: from bz2file import BZ2File + bz2Support = True except ImportError: bz2Support = False BASE_REPL_URL = "https://planet.openstreetmap.org/replication/changesets/" -class ChangesetMD(): + +class ChangesetMD: def __init__(self, createGeometry): self.createGeometry = createGeometry def truncateTables(self, connection): - print('truncating tables') + print("truncating tables") cursor = connection.cursor() cursor.execute("TRUNCATE TABLE osm_changeset_comment CASCADE;") cursor.execute("TRUNCATE TABLE osm_changeset CASCADE;") cursor.execute(queries.dropIndexes) - cursor.execute("UPDATE osm_changeset_state set last_sequence = -1, last_timestamp = null, update_in_progress = 0") + cursor.execute( + "UPDATE osm_changeset_state set last_sequence = -1, last_timestamp = null, update_in_progress = 0" + ) connection.commit() def createTables(self, connection): - print('creating tables') + print("creating tables") cursor = connection.cursor() cursor.execute(queries.createChangesetTable) cursor.execute(queries.initStateTable) @@ -52,32 +55,38 @@ def createTables(self, connection): def insertNewBatch(self, connection, data_arr): cursor = connection.cursor() if 
self.createGeometry: - sql = '''INSERT into osm_changeset + sql = """INSERT into osm_changeset (id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags, geom) - values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,ST_SetSRID(ST_MakeEnvelope(%s,%s,%s,%s), 4326))''' + values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,ST_SetSRID(ST_MakeEnvelope(%s,%s,%s,%s), 4326))""" psycopg2.extras.execute_batch(cursor, sql, data_arr) cursor.close() else: - sql = '''INSERT into osm_changeset + sql = """INSERT into osm_changeset (id, user_id, created_at, min_lat, max_lat, min_lon, max_lon, closed_at, open, num_changes, user_name, tags) - values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' + values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" psycopg2.extras.execute_batch(cursor, sql, data_arr) cursor.close() def insertNewBatchComment(self, connection, comment_arr): - cursor=connection.cursor() - sql = '''INSERT into osm_changeset_comment + cursor = connection.cursor() + sql = """INSERT into osm_changeset_comment (comment_changeset_id, comment_user_id, comment_user_name, comment_date, comment_text) - values (%s,%s,%s,%s,%s)''' + values (%s,%s,%s,%s,%s)""" psycopg2.extras.execute_batch(cursor, sql, comment_arr) cursor.close() def deleteExisting(self, connection, id): cursor = connection.cursor() - cursor.execute('''DELETE FROM osm_changeset_comment - WHERE comment_changeset_id = %s''', (id,)) - cursor.execute('''DELETE FROM osm_changeset - WHERE id = %s''', (id,)) + cursor.execute( + """DELETE FROM osm_changeset_comment + WHERE comment_changeset_id = %s""", + (id,), + ) + cursor.execute( + """DELETE FROM osm_changeset + WHERE id = %s""", + (id,), + ) def parseFile(self, connection, changesetFile, doReplication): parsedCount = 0 @@ -88,44 +97,86 @@ def parseFile(self, connection, changesetFile, doReplication): changesets = [] comments = [] for action, elem in context: - if(elem.tag != 'changeset'): + if elem.tag != "changeset": continue parsedCount += 1 tags 
= {} - for tag in elem.iterchildren(tag='tag'): - tags[tag.attrib['k']] = tag.attrib['v'] - - for discussion in elem.iterchildren(tag='discussion'): - for commentElement in discussion.iterchildren(tag='comment'): - for text in commentElement.iterchildren(tag='text'): - text = text.text - comment = (elem.attrib['id'], commentElement.attrib.get('uid'), commentElement.attrib.get('user'), commentElement.attrib.get('date'), text) + for tag in elem.iterchildren(tag="tag"): + tags[tag.attrib["k"]] = tag.attrib["v"] + + for discussion in elem.iterchildren(tag="discussion"): + for commentElement in discussion.iterchildren(tag="comment"): + for text in commentElement.iterchildren(tag="text"): + text = text.text + comment = ( + elem.attrib["id"], + commentElement.attrib.get("uid"), + commentElement.attrib.get("user"), + commentElement.attrib.get("date"), + text, + ) comments.append(comment) - if(doReplication): - self.deleteExisting(connection, elem.attrib['id']) + if doReplication: + self.deleteExisting(connection, elem.attrib["id"]) if self.createGeometry: - changesets.append((elem.attrib['id'], elem.attrib.get('uid', None), elem.attrib['created_at'], elem.attrib.get('min_lat', None), - elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), elem.attrib.get('max_lon', None), elem.attrib.get('closed_at', None), - elem.attrib.get('open', None), elem.attrib.get('num_changes', None), elem.attrib.get('user', None), tags,elem.attrib.get('min_lon', None), elem.attrib.get('min_lat', None), - elem.attrib.get('max_lon', None), elem.attrib.get('max_lat', None))) + changesets.append( + ( + elem.attrib["id"], + elem.attrib.get("uid", None), + elem.attrib["created_at"], + elem.attrib.get("min_lat", None), + elem.attrib.get("max_lat", None), + elem.attrib.get("min_lon", None), + elem.attrib.get("max_lon", None), + elem.attrib.get("closed_at", None), + elem.attrib.get("open", None), + elem.attrib.get("num_changes", None), + elem.attrib.get("user", None), + tags, + 
elem.attrib.get("min_lon", None), + elem.attrib.get("min_lat", None), + elem.attrib.get("max_lon", None), + elem.attrib.get("max_lat", None), + ) + ) else: - changesets.append((elem.attrib['id'], elem.attrib.get('uid', None), elem.attrib['created_at'], elem.attrib.get('min_lat', None), - elem.attrib.get('max_lat', None), elem.attrib.get('min_lon', None), elem.attrib.get('max_lon', None), elem.attrib.get('closed_at', None), - elem.attrib.get('open', None), elem.attrib.get('num_changes', None), elem.attrib.get('user', None), tags)) - - if((parsedCount % 100000) == 0): + changesets.append( + ( + elem.attrib["id"], + elem.attrib.get("uid", None), + elem.attrib["created_at"], + elem.attrib.get("min_lat", None), + elem.attrib.get("max_lat", None), + elem.attrib.get("min_lon", None), + elem.attrib.get("max_lon", None), + elem.attrib.get("closed_at", None), + elem.attrib.get("open", None), + elem.attrib.get("num_changes", None), + elem.attrib.get("user", None), + tags, + ) + ) + + if (parsedCount % 100000) == 0: self.insertNewBatch(connection, changesets) - self.insertNewBatchComment(connection, comments ) + self.insertNewBatchComment(connection, comments) changesets = [] comments = [] - print("parsed {}".format(('{:,}'.format(parsedCount)))) - print("cumulative rate: {}/sec".format('{:,.0f}'.format(parsedCount/timedelta.total_seconds(datetime.now() - startTime)))) - - #clear everything we don't need from memory to avoid leaking + print("parsed {}".format(("{:,}".format(parsedCount)))) + print( + "cumulative rate: {}/sec".format( + "{:,.0f}".format( + parsedCount + / timedelta.total_seconds(datetime.now() - startTime) + ) + ) + ) + + # clear everything we don't need from memory to avoid leaking elem.clear() while elem.getprevious() is not None: del elem.getparent()[0] @@ -141,7 +192,7 @@ def fetchReplicationFile(self, sequenceNumber): topdir = str(sequenceNumber)[:3] subdir = str(sequenceNumber)[3:6] fileNumber = str(sequenceNumber)[-3:] - fileUrl = BASE_REPL_URL + topdir 
+ '/' + subdir + '/' + fileNumber + '.osm.gz' + fileUrl = BASE_REPL_URL + topdir + "/" + subdir + "/" + fileNumber + ".osm.gz" print("opening replication file at " + fileUrl) replicationFile = requests.get(fileUrl, stream=True) replicationData = replicationFile.raw @@ -151,37 +202,41 @@ def fetchReplicationFile(self, sequenceNumber): def doReplication(self, connection): cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor) try: - cursor.execute('LOCK TABLE osm_changeset_state IN ACCESS EXCLUSIVE MODE NOWAIT') + cursor.execute( + "LOCK TABLE osm_changeset_state IN ACCESS EXCLUSIVE MODE NOWAIT" + ) except psycopg2.OperationalError as e: print("error getting lock on state table. Another process might be running") return 1 - cursor.execute('select * from osm_changeset_state') + cursor.execute("select * from osm_changeset_state") dbStatus = cursor.fetchone() - lastDbSequence = dbStatus['last_sequence'] + lastDbSequence = dbStatus["last_sequence"] timestamp = None lastServerTimestamp = None newTimestamp = None - if(dbStatus['last_timestamp'] is not None): - timestamp = dbStatus['last_timestamp'] + if dbStatus["last_timestamp"] is not None: + timestamp = dbStatus["last_timestamp"] print("latest timestamp in database: " + str(timestamp)) - if(dbStatus['update_in_progress'] == 1): + if dbStatus["update_in_progress"] == 1: print("concurrent update in progress. Bailing out!") return 1 - if(lastDbSequence == -1): - print("replication state not initialized. You must set the sequence number first.") + if lastDbSequence == -1: + print( + "replication state not initialized. You must set the sequence number first." 
+ ) return 1 - cursor.execute('update osm_changeset_state set update_in_progress = 1') + cursor.execute("update osm_changeset_state set update_in_progress = 1") connection.commit() print("latest sequence from the database: " + str(lastDbSequence)) - #No matter what happens after this point, execution needs to reach the update statement - #at the end of this method to unlock the database or an error will forever leave it locked + # No matter what happens after this point, execution needs to reach the update statement + # at the end of this method to unlock the database or an error will forever leave it locked returnStatus = 0 try: serverState = yaml.load(requests.get(BASE_REPL_URL + "state.yaml").text) - lastServerSequence = serverState['sequence'] + lastServerSequence = serverState["sequence"] print("got sequence") - lastServerTimestamp = serverState['last_run'] + lastServerTimestamp = serverState["last_run"] print("last timestamp on server: " + str(lastServerTimestamp)) except Exception as e: print("error retrieving server state file. Bailing on replication") @@ -190,12 +245,17 @@ def doReplication(self, connection): else: try: print("latest sequence on OSM server: " + str(lastServerSequence)) - if(lastServerSequence > lastDbSequence): + if lastServerSequence > lastDbSequence: print("server has new sequence. 
commencing replication") currentSequence = lastDbSequence + 1 - while(currentSequence <= lastServerSequence): - self.parseFile(connection, self.fetchReplicationFile(currentSequence), True) - cursor.execute('update osm_changeset_state set last_sequence = %s', (currentSequence,)) + while currentSequence <= lastServerSequence: + self.parseFile( + connection, self.fetchReplicationFile(currentSequence), True + ) + cursor.execute( + "update osm_changeset_state set last_sequence = %s", + (currentSequence,), + ) connection.commit() currentSequence += 1 timestamp = lastServerTimestamp @@ -204,31 +264,106 @@ def doReplication(self, connection): print("error during replication") print(e) returnStatus = 2 - cursor.execute('update osm_changeset_state set update_in_progress = 0, last_timestamp = %s', (timestamp,)) + cursor.execute( + "update osm_changeset_state set update_in_progress = 0, last_timestamp = %s", + (timestamp,), + ) connection.commit() return returnStatus -if __name__ == '__main__': + +if __name__ == "__main__": beginTime = datetime.now() endTime = None timeCost = None - argParser = argparse.ArgumentParser(description="Parse OSM Changeset metadata into a database") - argParser.add_argument('-t', '--trunc', action='store_true', default=False, dest='truncateTables', help='Truncate existing tables (also drops indexes)') - argParser.add_argument('-c', '--create', action='store_true', default=False, dest='createTables', help='Create tables') - argParser.add_argument('-H', '--host', action='store', dest='dbHost', help='Database hostname') - argParser.add_argument('-P', '--port', action='store', dest='dbPort', default=None, help='Database port') - argParser.add_argument('-u', '--user', action='store', dest='dbUser', default=None, help='Database username') - argParser.add_argument('-p', '--password', action='store', dest='dbPass', default=None, help='Database password') - argParser.add_argument('-d', '--database', action='store', dest='dbName', help='Target database', 
required=True) - argParser.add_argument('-f', '--file', action='store', dest='fileName', help='OSM changeset file to parse') - argParser.add_argument('-r', '--replicate', action='store_true', dest='doReplication', default=False, help='Apply a replication file to an existing database') - argParser.add_argument('-g', '--geometry', action='store_true', dest='createGeometry', default=False, help='Build geometry of changesets (requires postgis)') + argParser = argparse.ArgumentParser( + description="Parse OSM Changeset metadata into a database" + ) + argParser.add_argument( + "-t", + "--trunc", + action="store_true", + default=False, + dest="truncateTables", + help="Truncate existing tables (also drops indexes)", + ) + argParser.add_argument( + "-c", + "--create", + action="store_true", + default=False, + dest="createTables", + help="Create tables", + ) + argParser.add_argument( + "-H", "--host", action="store", dest="dbHost", help="Database hostname" + ) + argParser.add_argument( + "-P", + "--port", + action="store", + dest="dbPort", + default=None, + help="Database port", + ) + argParser.add_argument( + "-u", + "--user", + action="store", + dest="dbUser", + default=None, + help="Database username", + ) + argParser.add_argument( + "-p", + "--password", + action="store", + dest="dbPass", + default=None, + help="Database password", + ) + argParser.add_argument( + "-d", + "--database", + action="store", + dest="dbName", + help="Target database", + required=True, + ) + argParser.add_argument( + "-f", + "--file", + action="store", + dest="fileName", + help="OSM changeset file to parse", + ) + argParser.add_argument( + "-r", + "--replicate", + action="store_true", + dest="doReplication", + default=False, + help="Apply a replication file to an existing database", + ) + argParser.add_argument( + "-g", + "--geometry", + action="store_true", + dest="createGeometry", + default=False, + help="Build geometry of changesets (requires postgis)", + ) args = argParser.parse_args() - 
conn = psycopg2.connect(database=args.dbName, user=args.dbUser, password=args.dbPass, host=args.dbHost, port=args.dbPort) - + conn = psycopg2.connect( + database=args.dbName, + user=args.dbUser, + password=args.dbPass, + host=args.dbHost, + port=args.dbPort, + ) md = ChangesetMD(args.createGeometry) if args.truncateTables: @@ -239,39 +374,43 @@ def doReplication(self, connection): psycopg2.extras.register_hstore(conn) - if(args.doReplication): + if args.doReplication: returnStatus = md.doReplication(conn) sys.exit(returnStatus) if not (args.fileName is None): if args.createGeometry: - print('parsing changeset file with geometries') + print("parsing changeset file with geometries") else: - print('parsing changeset file') + print("parsing changeset file") changesetFile = None - if(args.doReplication): - changesetFile = gzip.open(args.fileName, 'rb') + if args.doReplication: + changesetFile = gzip.open(args.fileName, "rb") else: - if(args.fileName[-4:] == '.bz2'): - if(bz2Support): + if args.fileName[-4:] == ".bz2": + if bz2Support: changesetFile = BZ2File(args.fileName) else: - print('ERROR: bzip2 support not available. Unzip file first or install bz2file') + print( + "ERROR: bzip2 support not available. Unzip file first or install bz2file" + ) sys.exit(1) else: - changesetFile = open(args.fileName, 'rb') + changesetFile = open(args.fileName, "rb") - if(changesetFile != None): + if changesetFile != None: md.parseFile(conn, changesetFile, args.doReplication) else: - print('ERROR: no changeset file opened. Something went wrong in processing args') + print( + "ERROR: no changeset file opened. 
Something went wrong in processing args" + ) sys.exist(1) - if(not args.doReplication): + if not args.doReplication: cursor = conn.cursor() - print('creating constraints') + print("creating constraints") cursor.execute(queries.createConstraints) - print('creating indexes') + print("creating indexes") cursor.execute(queries.createIndexes) if args.createGeometry: cursor.execute(queries.createGeomIndex) @@ -282,6 +421,6 @@ def doReplication(self, connection): endTime = datetime.now() timeCost = endTime - beginTime - print('Processing time cost is ', timeCost) + print("Processing time cost is ", timeCost) - print('All done. Enjoy your (meta)data!') + print("All done. Enjoy your (meta)data!") diff --git a/requirements.txt b/requirements.txt index 1f780f4..d1b30e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -lxml==4.2.3 -psycopg2-binary==2.7.5 -PyYAML==5.1.2 -requests==2.11.1 +lxml==4.9.1 +psycopg2-binary==2.9.3 +PyYAML==6.0 +requests==2.28.1 bz2file==0.98 \ No newline at end of file From 77924b90274e5aa001a380216090418973564ba4 Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Mon, 30 Jan 2023 08:17:53 -0700 Subject: [PATCH 20/24] Create FUNDING.yml --- .github/FUNDING.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..8210b1b --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,6 @@ +# These are supported funding model platforms + +github: [mvexel] +patreon: mvexel +ko_fi: mvexel +liberapay: mvexel From 0a370f95b149712798b37ef2b5bba780b212c504 Mon Sep 17 00:00:00 2001 From: Andy Townsend Date: Thu, 29 Jun 2023 23:48:00 +0000 Subject: [PATCH 21/24] Changed "yaml.load" to "yaml.full_load" per https://stackoverflow.com/questions/69564817/typeerror-load-missing-1-required-positional-argument-loader-in-google-col --- changesetmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changesetmd.py 
b/changesetmd.py index dd62587..4b90962 100755 --- a/changesetmd.py +++ b/changesetmd.py @@ -233,7 +233,7 @@ def doReplication(self, connection): # at the end of this method to unlock the database or an error will forever leave it locked returnStatus = 0 try: - serverState = yaml.load(requests.get(BASE_REPL_URL + "state.yaml").text) + serverState = yaml.full_load(requests.get(BASE_REPL_URL + "state.yaml").text) lastServerSequence = serverState["sequence"] print("got sequence") lastServerTimestamp = serverState["last_run"] From 4142e0f40e681cfac5dc2beb9d76c13f2ae7f646 Mon Sep 17 00:00:00 2001 From: Andy Townsend Date: Sat, 8 Jul 2023 01:42:52 +0100 Subject: [PATCH 22/24] Updated README to contain a Debian summary section. --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index c253d6b..0c5e871 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,28 @@ It is easiest if your OS user has access to this database. I just created a user createuser +Full Debian build instructions +------------------------------ + + sudo apt install sudo screen locate git tar unzip wget bzip2 apache2 python3-psycopg2 python3-yaml libpq-dev postgresql postgresql-contrib postgis postgresql-15-postgis-3 postgresql-15-postgis-3-scripts net-tools curl python3-full gcc libpython3.11-dev libxml2-dev libxslt-dev + + python3 -m venv .venv + source .venv/bin/activate + pip install -r requirements.txt + + sudo -u postgres -i + createuser youruseraccount + createdb -E UTF8 -O youruseraccount changesets + + psql + \c changesets + CREATE EXTENSION postgis; + ALTER TABLE geometry_columns OWNER TO youruseraccount; + ALTER TABLE spatial_ref_sys OWNER TO youruseraccount; + \q + exit + + Execution ------------ The first time you run it, you will need to include the -c | --create option to create the table: From 711fbc090588c4c12e444443ecd01a34d862aa9e Mon Sep 17 00:00:00 2001 From: Andy Townsend Date: Thu, 5 Oct 2023 21:15:05 +0100 Subject: [PATCH 
23/24] Consolidate readme changes from replication_changes_01 branch. --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0c5e871..0df1348 100644 --- a/README.md +++ b/README.md @@ -54,17 +54,19 @@ Execution ------------ The first time you run it, you will need to include the -c | --create option to create the table: - python changesetmd.py -d -c + python changesetmd.py -d -c -g + +The `-g` | `--geometry` argument is optional and builds polygon geometries for changesets so that you can query which changesets were within which areas. The create function can be combined with the file option to immediately parse a file. To parse a dump file, use the -f | --file option. - python changesetmd.py -d -f /tmp/changeset-latest.osm + python changesetmd.py -d -g -f /tmp/discussions-latest.osm.bz2 If no other arguments are given, it will access postgres using the default settings of the postgres client, typically connecting on the unix socket as the current OS user. Use the ```--help``` argument to see optional arguments for connecting to postgres. -You can add the `-g` | `--geometry` option to build polygon geometries (the database also needs to be created with this option). +Again, the `-g` | `--geometry` argument is optional. Either of changeset-latest.osm.bz2 or discussions-latest.osm.bz2 or neither can be used to populate the database. 
Replication ------------ From ce41965dd9637f4f085e779a5b257cdea23328ee Mon Sep 17 00:00:00 2001 From: Martijn van Exel Date: Mon, 13 Nov 2023 12:10:03 -0700 Subject: [PATCH 24/24] chores: move license statement to LICENSE file Update requirements --- LICENSE | 14 ++++++++++++++ README.md | 10 ---------- requirements.txt | 11 ++++++----- 3 files changed, 20 insertions(+), 15 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1dae9ab --- /dev/null +++ b/LICENSE @@ -0,0 +1,14 @@ +Copyright (C) 2012 Toby Murray + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . \ No newline at end of file diff --git a/README.md b/README.md index 0df1348..3703eb0 100644 --- a/README.md +++ b/README.md @@ -148,13 +148,3 @@ Find all changesets that were created in Liberty Island: SELECT count(id) FROM osm_changeset c, (SELECT ST_SetSRID(ST_MakeEnvelope(-74.0474545,40.6884971,-74.0433990,40.6911817),4326) AS geom) s WHERE ST_CoveredBy(c.geom, s.geom); - -License ------------- -Copyright (C) 2012 Toby Murray - -This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
- -This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - -See the GNU Affero General Public License for more details: http://www.gnu.org/licenses/agpl.txt diff --git a/requirements.txt b/requirements.txt index d1b30e4..1904ef7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -lxml==4.9.1 -psycopg2-binary==2.9.3 -PyYAML==6.0 -requests==2.28.1 -bz2file==0.98 \ No newline at end of file +bz2file==0.98 +lxml==4.9.3 +psycopg2-binary==2.9.9 +PyYAML==6.0.1 +requests==2.31.0 +urllib3==1.26.18 \ No newline at end of file