From f1df650ffe4e753be31b9cb8cb6705b996fc01dd Mon Sep 17 00:00:00 2001 From: Jeff Kehler Date: Sat, 4 Oct 2014 22:54:23 +0700 Subject: [PATCH] Fix issue with html pages containing XML declarations --- goose/parsers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/goose/parsers.py b/goose/parsers.py index a43e9b47..f5374976 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -26,6 +26,7 @@ from copy import deepcopy from goose.text import innerTrim from goose.text import encodeValue +import re class Parser(object): @@ -51,7 +52,12 @@ def css_select(self, node, selector): @classmethod def fromstring(self, html): html = encodeValue(html) + + # remove the XML declaration tag because it breaks the lxml html parser + html = re.sub(r'<\?xml version\=[\"\'][0-9]\.[0-9][\"\'] encoding\=(.*?)\?>', '', html) + self.doc = lxml.html.fromstring(html) + return self.doc @classmethod