-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathents.py
More file actions
42 lines (36 loc) · 1.15 KB
/
ents.py
File metadata and controls
42 lines (36 loc) · 1.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""ents.py
Convert SGML character entities into Unicode
Function taken from:
http://stackoverflow.com/questions/1197981/convert-html-entities-to-ascii-in-python/1582036#1582036
Thanks agazso!
"""
import re

try:
    import htmlentitydefs  # Python 2 module name
except ImportError:
    # Python 3 moved the entity tables to html.entities.
    import html.entities as htmlentitydefs
# One pattern for all three reference forms so the input is scanned exactly
# once: decimal (&#65;), hexadecimal (&#x41;) and named (&amp;).
_ENTITY_RE = re.compile(r"&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|(\w+));")

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a Unicode character


def _decode_entity(match):
    """Return the character for one matched entity, or the entity unchanged."""
    decimal, hexadecimal, name = match.groups()
    try:
        if decimal is not None:
            return _unichr(int(decimal))
        if hexadecimal is not None:
            return _unichr(int(hexadecimal, 16))
        return _unichr(htmlentitydefs.name2codepoint[name])
    except (KeyError, ValueError, OverflowError):
        # Unknown entity name, or a code point the interpreter cannot
        # represent: keep the text exactly as it was written.
        return match.group(0)


def convert(s):
    """Replace SGML/HTML character entities in *s* with Unicode characters.

    Decimal references (``&#233;``), hexadecimal references (``&#xE9;``)
    and named entities listed in ``htmlentitydefs.name2codepoint``
    (``&eacute;``) are decoded. Anything that merely looks like an entity
    but has an unknown name or an out-of-range code point is left untouched.

    The string is decoded in a single pass, so text produced by one
    replacement is never decoded a second time: ``&amp;lt;`` becomes
    ``&lt;``, not ``<``.

    Originally based on:
    http://stackoverflow.com/questions/1197981/convert-html-entities-to-ascii-in-python/1582036#1582036

    Args:
        s: The input string, possibly containing character entities.

    Returns:
        A copy of *s* with every recognised entity replaced.
    """
    return _ENTITY_RE.sub(_decode_entity, s)