-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextExtracter.py
More file actions
71 lines (55 loc) · 2.53 KB
/
textExtracter.py
File metadata and controls
71 lines (55 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from bs4 import BeautifulSoup
import os
from datetime import datetime
def clean_text(text):
    """Collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every run of whitespace (spaces, tabs, line breaks)
        replaced by a single space, and with no leading or trailing
        whitespace. An empty or whitespace-only input yields ``''``.
    """
    # str.split() with no separator already discards leading and trailing
    # whitespace, so the joined result never needs an extra strip().
    return ' '.join(text.split())
def extract_article_content(html_content, content_class='content-offset'):
    """Extract the main article text from an HTML document.

    Script, style and page-chrome elements are removed first. The text of
    every paragraph and heading inside the first ``<div>`` carrying
    ``content_class`` is then collected. If that yields nothing (no such
    div, or it holds no non-empty paragraphs/headings), the text of the
    whole remaining document is returned instead.

    Args:
        html_content: The HTML markup to parse.
        content_class: CSS class of the ``<div>`` that wraps the article
            body. Defaults to ``'content-offset'``.

    Returns:
        The paragraphs and headings joined by blank lines, or, in the
        fallback case, the whole document's text on a single line. May be
        an empty string when the document contains no text at all.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove elements that never carry article text (scripts, styles,
    # navigation, etc.) so they cannot leak into either extraction path.
    for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
        unwanted.decompose()

    article_content = []

    # Attempt to locate the main content div.
    content_div = soup.find('div', class_=content_class)
    if content_div is not None:
        blocks = content_div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        cleaned = (clean_text(block.get_text()) for block in blocks)
        article_content = [text for text in cleaned if text]

    if not article_content:
        # Fallback: take all remaining text. A space separator keeps words
        # from adjacent elements apart (the default '' would fuse
        # "<p>a</p><p>b</p>" into "ab"); clean_text collapses any surplus.
        return clean_text(soup.get_text(separator=' '))

    return '\n\n'.join(article_content)
# Directories for source HTML and extracted text
html_base_dir = 'downloaded_html'
text_base_dir = 'articles'

# Use today's date (local time) for locating HTML files and saving extracted text
current_date = datetime.now().strftime("%Y-%m-%d")
html_dir = os.path.join(html_base_dir, current_date)
text_dir = os.path.join(text_base_dir, current_date)

# Ensure output directory exists
os.makedirs(text_dir, exist_ok=True)


def _convert_html_files(source_dir, target_dir):
    """Extract the text of every ``.html`` file in ``source_dir``.

    For each file, a ``.txt`` file with the same base name is written to
    ``target_dir`` (which must already exist).

    Args:
        source_dir: Directory containing the downloaded ``.html`` files.
        target_dir: Directory that receives the ``.txt`` files.

    Returns:
        The number of files converted; 0 when ``source_dir`` is missing,
        is not a directory, or contains no ``.html`` files.
    """
    if not os.path.isdir(source_dir):
        return 0

    # Sort for a deterministic processing order (os.listdir order is
    # arbitrary) and skip directories that merely end in '.html'.
    html_files = sorted(
        name for name in os.listdir(source_dir)
        if name.endswith('.html') and os.path.isfile(os.path.join(source_dir, name))
    )

    for filename in html_files:
        print(f"📄 Processing {filename}...")

        # Read the HTML file
        html_path = os.path.join(source_dir, filename)
        with open(html_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Extract text
        article_text = extract_article_content(html_content)

        # Swap only the final extension: str.replace would also rewrite
        # any '.html' occurring earlier in the name.
        text_filename = os.path.splitext(filename)[0] + '.txt'
        text_path = os.path.join(target_dir, text_filename)
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(article_text)

    return len(html_files)


# Process each HTML file in today's folder. Success is reported only when
# at least one file was actually converted.
if _convert_html_files(html_dir, text_dir):
    print(f"Text extraction complete! Files saved in: {text_dir}")
else:
    print(f"No HTML files found in {html_dir}. Check if the HTML files exist.")