-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontent_processor.py
More file actions
40 lines (32 loc) · 1.54 KB
/
content_processor.py
File metadata and controls
40 lines (32 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup
class ContentProcessor:
    """Converts raw HTML into plain text for downstream processing."""

    def extract_text_from_html(self, html_content: str) -> str:
        """
        Extracts all text from HTML content, removing script and style tags.

        Args:
            html_content: Raw HTML markup. ``None`` or an empty value is
                tolerated and yields an empty string.

        Returns:
            The document's text fragments, each stripped of surrounding
            whitespace and joined with single spaces.
        """
        # Nothing to parse: return early instead of handing the value to
        # BeautifulSoup, which raises TypeError when given None.
        if not html_content:
            return ""

        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style tags so their contents are not treated as text.
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()

        # stripped_strings yields each text fragment already trimmed, so the
        # generator can be joined directly without building a list first.
        return " ".join(soup.stripped_strings)

    def extract_main_content(self, html_content: str) -> str:
        """
        Extracts the main content from HTML.

        Currently, this is a simplified version that extracts all text by
        delegating to ``extract_text_from_html``.

        Args:
            html_content: Raw HTML markup.

        Returns:
            The extracted text as a single string.
        """
        # Future: Enhance with more advanced techniques (e.g., using libraries
        # like newspaper3k or readability-lxml) for better main content identification.
        return self.extract_text_from_html(html_content)

    def process_content(self, html_content: str) -> dict:
        """
        Processes HTML content to extract text.

        The returned dictionary can be expanded with more NLP features in the future.

        Args:
            html_content: Raw HTML markup.

        Returns:
            A dictionary with a single key, ``"processed_text"``, holding the
            text produced by ``extract_main_content``.
        """
        extracted_text = self.extract_main_content(html_content)
        # Future: Add more keys like "summary", "topics", "entities"
        # from NLP processing steps.
        return {"processed_text": extracted_text}
# For a usage demonstration, see examples/content_processor_demo.py