From 0e5481ab59a41fb06b41be581694ebf34efeff2b Mon Sep 17 00:00:00 2001 From: Gaurav Arora Date: Mon, 4 Aug 2014 15:15:10 +0530 Subject: [PATCH] separating paragraphs with a new line Clean text looks clumsy because paragraphs are not separated; add a new line for each paragraph --- goose/outputformatters.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/goose/outputformatters.py b/goose/outputformatters.py index ae42457b..01f25966 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -62,6 +62,7 @@ def get_formatted_text(self): self.remove_negativescores_nodes() self.links_to_text() self.add_newline_to_br() + self.add_newline_to_paragraph() self.replace_with_text() self.remove_fewwords_paragraphs() return self.convert_to_text() @@ -80,6 +81,14 @@ def add_newline_to_br(self): for e in self.parser.getElementsByTag(self.top_node, tag='br'): e.text = r'\n' + def add_newline_to_paragraph(self): + for e in self.parser.getElementsByTag(self.top_node, tag='p'): + if e.text is not None: + e.text = e.text+r'\n' + else : + e.text = r'\n' + + def links_to_text(self): """\ cleans up and converts any nodes that