-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtitle_extractor.py
More file actions
59 lines (47 loc) · 1.51 KB
/
title_extractor.py
File metadata and controls
59 lines (47 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""
Title Extraction Module
Extracts article titles from HTML metadata, markdown headings, or URL fallback.
"""
import re
from urllib.parse import urlparse
try:
import trafilatura
except Exception: # pragma: no cover
trafilatura = None
def extract_title_from_metadata(html: str, url: str) -> str | None:
    """
    Extract title from page metadata using trafilatura.

    Args:
        html: Raw HTML of the page.
        url: URL of the page, passed to trafilatura as ``default_url``.

    Returns:
        The metadata title with surrounding whitespace removed, or None if
        trafilatura is not available, extraction fails, or the title is
        missing or blank.
    """
    # trafilatura is an optional dependency; the module-level import guard
    # sets it to None when the import failed.
    if trafilatura is None:
        return None

    try:
        metadata = trafilatura.extract_metadata(html, default_url=url)
    except Exception:
        # Deliberate best-effort: a failure while parsing arbitrary HTML must
        # not abort title extraction; callers fall back to other strategies.
        return None

    title = getattr(metadata, "title", None) if metadata else None
    if not isinstance(title, str):
        return None

    # A whitespace-only title counts as "not found" rather than "".
    return title.strip() or None
def extract_h1_title(markdown: str) -> str | None:
"""
Extract the first H1 heading from markdown content.
Returns:
The title text without the # prefix, or None if no H1 found
"""
for line in markdown.splitlines():
match = re.match(r"^\s*#\s+(.+?)\s*$", line)
if match:
return match.group(1)
return None
# Characters outside this whitelist are removed from generated names.
_UNSAFE_NAME_CHARS = re.compile(r"[^A-Za-z0-9._ -]+")


def fallback_name_from_url(original_url: str) -> str:
    """
    Generate a filename from the URL structure when no title is found.

    The host and each non-empty path segment are joined with " - " after
    removing every character outside ``A-Za-z0-9._ -``. Query string,
    fragment and any ``user:password@`` prefix are not included.

    Args:
        original_url: The page URL, with or without a scheme.

    Returns:
        The generated name, or "page" if nothing usable remains. Never
        raises for malformed URLs.
    """
    # Ensure URL has a scheme for parsing
    if "://" not in original_url:
        original_url = "https://" + original_url

    try:
        parsed = urlparse(original_url)
        netloc, path = parsed.netloc, parsed.path
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unbalanced "[").
        # As the last-resort namer, split the URL by hand instead of failing.
        remainder = original_url.partition("://")[2]
        remainder = re.split(r"[?#]", remainder, maxsplit=1)[0]
        netloc, _, path = remainder.partition("/")

    # Drop credentials so they never end up in a filename.
    host = netloc.rpartition("@")[2]

    # Sanitize per segment and skip empty ones, so "a//b" or a segment made
    # only of stripped characters does not produce a doubled separator.
    cleaned = (
        _UNSAFE_NAME_CHARS.sub("", segment).strip()
        for segment in (host, *path.split("/"))
    )
    base = " - ".join(part for part in cleaned if part)
    return base or "page"