-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAcquireData.py
More file actions
48 lines (36 loc) · 1.24 KB
/
AcquireData.py
File metadata and controls
48 lines (36 loc) · 1.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""Functions to acquire data by:
1) downloading files
2) parsing xml
"""
__all__ = ['download_url', 'download_url_to_file', 'get_xml']
__version__ = '0.1'
__author__ = "Steven Smith"
from bs4 import BeautifulSoup
import os
import requests
def download_url(url, timeout=30):
    """Download a url and return the Response object.

    NOTE: The arXiv API requires a user-agent or rejects the request.

    :param url: url address to retrieve
    :param timeout: seconds to wait for the server before giving up; passed
        unchanged to ``requests.get``. Pass ``None`` to wait indefinitely,
        which is how this function behaved before the parameter existed.
    :return: a Response object
    :raises requests.exceptions.Timeout: if the server does not answer
        within ``timeout`` seconds
    """
    # Without a timeout, requests blocks forever on an unresponsive server.
    return requests.get(url, headers={'user-agent': 'Lynx'}, timeout=timeout)
def download_url_to_file(url, filepath, overwrite_file=True):
    """Download data from a url and save it to a file.

    The download is skipped when ``overwrite_file`` is false and ``filepath``
    already exists; the existing file is then left untouched.

    NOTE: The response status is not inspected here, so whatever body the
    server returned is what gets written to ``filepath``.

    :param url: url address to retrieve
    :param filepath: file path to save the data to
    :param overwrite_file: if a file exists should it be overwritten?
    :return: the Response object, or ``None`` if the download was skipped
    """
    if not overwrite_file and os.path.exists(filepath):
        # Nothing was downloaded, so there is no Response to hand back.
        return None

    response = download_url(url)
    # Binary mode: the body is stored exactly as it was received.
    with open(filepath, 'wb') as f:
        f.write(response.content)
    return response
def get_xml(xml_filename):
    """Read an xml file and parse it.

    The file is opened as utf8 text and handed to BeautifulSoup using the
    'lxml-xml' parser; the file is closed again before returning.

    :param xml_filename: the xml filename to parse
    :return: a BeautifulSoup xml object
    """
    with open(xml_filename, encoding='utf8') as xml_file:
        soup = BeautifulSoup(xml_file, 'lxml-xml')
    return soup