-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontext_analyzer.py
More file actions
137 lines (113 loc) · 6.2 KB
/
context_analyzer.py
File metadata and controls
137 lines (113 loc) · 6.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re
# Basic regex patterns - these can be significantly improved for accuracy and internationalization.
# Regex for emails: handles the most common unquoted address formats.
# Local part: ASCII letters, digits, "." and the RFC 5322 "atext" punctuation.
# ";" is deliberately NOT accepted: it is not valid in an unquoted local part and
# is a common separator in address lists ("a@x.com;b@y.com"); accepting it glued
# the separator onto the following address (";b@y.com").
# Domain: letters, digits, "." and "-", followed by an alphabetic TLD of 2-20 chars.
EMAIL_REGEX = r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}"
# Regex for URLs: matches URLs that start with http://, https://, ftp:// or a bare
# "www." prefix, followed by a dotted host and an optional path/query/fragment.
# Text such as "example.com" (no scheme and no "www.") is intentionally not matched.
# The trailing negative lookbehind stops a match from ending on ".", ":" or "?",
# so sentence punctuation right after a URL ("... see www.example.com.") is not
# swallowed into the result. Still not exhaustive for every URL form.
URL_REGEX = (
    r"(?:(?:https?|ftp):\/\/|www\.)"      # scheme, or a bare "www." prefix
    r"[-a-zA-Z0-9@:%._\+~#=]{1,256}"      # host characters up to the last dot
    r"\.[a-zA-Z0-9()]{1,6}\b"             # final dot-separated label (1-6 chars)
    r"(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)"  # optional path / query / fragment
    r"(?<![.:?])"                         # never end on sentence punctuation
)
# Generic phone-number pattern. Phone formats vary too much for a regex to be
# reliable; a dedicated parsing library is the robust option. Read left to right,
# the pattern accepts:
#   1. an optional prefix: optional "+", 1-4 digits, optional separator
#   2. an optional area code: 1-5 digits, optionally wrapped in parentheses,
#      optional separator
#   3. the core number: two groups of 3-5 digits with an optional separator between
#   4. an optional trailing group of 1-5 digits
# A separator is a single "-", "." or whitespace character.
PHONE_REGEX = (
    r"(?:\+?\d{1,4}[-.\s]?)?"
    r"(?:\(?\d{1,5}\)?[-.\s]?)?"
    r"\d{3,5}[-.\s]?\d{3,5}"
    r"(?:[-.\s]?\d{1,5})?"
)
# German-specific phone pattern, applied before the generic one.
# Examples it targets: 0171 1234567, +49 30 1234567, (030) 123456-78
# Structure:
#   - a prefix: either "0049"/"0" (optionally parenthesised) or "+49", each
#     followed by optional whitespace and 1-5 digits
#   - then either two or more digit groups (2+ digits each, optional separator
#     before each), or one block of 4-12 digits with an optional separator
GERMAN_PHONE_REGEX = (
    r"(?:"
    r"(?:\(?(?:0049|0)\)?\s?\d{1,5})"
    r"|(?:\+49\s?\d{1,5})"
    r")"
    r"(?:"
    r"(?:[\s.-]?\d{2,}){2,}"
    r"|[\s.-]?\d{4,12}"
    r")"
)
# Regex for dates: common all-numeric formats using "-", "/" or "." as separators.
# The pattern is written in verbose layout (whitespace and inline "#" comments),
# so it must be used together with the re.VERBOSE flag.
# Year-first forms:  YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD
# Year-last forms:   DD.MM.YYYY, DD/MM/YYYY, DD-MM-YYYY, MM/DD/YYYY (common US)
# Limitations: day/month ranges are not validated, and the two separators inside
# one date are matched independently, so they need not be the same character.
# The single capturing group wraps the whole date, so re.findall returns the date
# strings themselves.
DATE_REGEX = r"""
\b(
\d{4}[-/.]\d{1,2}[-/.]\d{1,2} | # YYYY-MM-DD and variations
\d{1,2}[-/.]\d{1,2}[-/.]\d{4} # DD.MM.YYYY and MM/DD/YYYY and variations
)\b
"""
def find_emails(text: str) -> list[str]:
    """Return the unique email addresses found in ``text``.

    Args:
        text: Text to scan. Empty or falsy input yields an empty list.

    Returns:
        Each distinct address matched by ``EMAIL_REGEX``, in order of first
        appearance in ``text``.
    """
    if not text:
        return []
    matches = re.findall(EMAIL_REGEX, text, re.IGNORECASE)
    # dict.fromkeys removes duplicates while keeping first-occurrence order;
    # going through a set made the result order vary between runs.
    return list(dict.fromkeys(matches))
def find_urls(text: str) -> list[str]:
    """Return the unique URLs found in ``text``.

    Args:
        text: Text to scan. Empty or falsy input yields an empty list.

    Returns:
        Each distinct URL matched by ``URL_REGEX``, in order of first appearance
        in ``text``.
    """
    if not text:
        return []
    # A common issue is OCR splitting URLs with spaces when they wrap lines.
    # The regex does not handle that; pre-processing of OCR text may be needed.
    matches = re.findall(URL_REGEX, text, re.IGNORECASE)
    # dict.fromkeys removes duplicates while keeping first-occurrence order;
    # going through a set made the result order vary between runs.
    return list(dict.fromkeys(matches))
def find_phone_numbers(text: str) -> list[str]:
    """Return the unique phone-number candidates found in ``text``.

    Two passes are made and their results merged:

    1. ``GERMAN_PHONE_REGEX`` over a copy of the text in which whitespace between
       digits has been removed. Matches from this pass therefore come from the
       collapsed text and carry no inner spaces.
    2. ``PHONE_REGEX`` over the unmodified text.

    A candidate is kept only if at least 7 characters remain once whitespace,
    parentheses, ".", "-" and "/" are stripped from it (a leading "+" counts).

    Args:
        text: Text to scan. Empty or falsy input yields an empty list.

    Returns:
        The distinct candidates, longest first. Candidates of equal length are
        ordered alphabetically so the result is deterministic.
    """
    if not text:
        return []

    # Heuristic: German numbers are often written in space-separated groups, so
    # join digits that are separated only by whitespace before matching.
    # Lookarounds leave the neighbouring digits unconsumed, so runs such as
    # "0 1 7 1" collapse completely; consuming both digits only joined every
    # other gap.
    collapsed_text = re.sub(r"(?<=\d)\s+(?=\d)", "", text)

    candidates = (
        *re.findall(GERMAN_PHONE_REGEX, collapsed_text),
        *re.findall(PHONE_REGEX, text),
    )

    found_numbers = set()
    for num in candidates:
        # Basic cleanup: drop common separators to roughly normalise the number.
        cleaned_num = re.sub(r"[\s().\-/]", "", num)
        if len(cleaned_num) >= 7:  # Arbitrary minimum length for a plausible number
            found_numbers.add(num.strip())

    # Prefer longer matches; break ties alphabetically because set iteration
    # order is arbitrary.
    return sorted(found_numbers, key=lambda num: (-len(num), num))
def find_dates(text: str) -> list[str]:
    """Return the unique numeric dates found in ``text``.

    Args:
        text: Text to scan. Empty or falsy input yields an empty list.

    Returns:
        Each distinct date string matched by ``DATE_REGEX``, in order of first
        appearance in ``text``.
    """
    if not text:
        return []
    # DATE_REGEX is written with layout whitespace and inline comments, so it
    # has to be applied with re.VERBOSE.
    matches = re.findall(DATE_REGEX, text, re.VERBOSE)
    # dict.fromkeys removes duplicates while keeping first-occurrence order;
    # going through a set made the result order vary between runs.
    return list(dict.fromkeys(matches))
def analyze_text_for_contextual_items(text: str) -> dict:
    """
    Run every finder over ``text`` and collect the non-empty results.

    The returned dictionary maps an item type ('emails', 'urls', 'phones',
    'dates') to the list of strings found for it. Types with no hits are left
    out, and empty or whitespace-only input produces an empty dictionary.
    """
    if not (text and text.strip()):
        return {}

    finders = (
        ("emails", find_emails),
        ("urls", find_urls),
        ("phones", find_phone_numbers),
        ("dates", find_dates),
    )

    report = {}
    for category, finder in finders:
        hits = finder(text)
        # Only categories that found something are reported.
        if hits:
            report[category] = hits
    return report
if __name__ == '__main__':
    # Manual demo: run the analyzer over a few sample texts and print the results.
    rich_sample = """
Hello, please contact me at test.user@example.com or my_other.email@sub.example.co.uk.
Visit our website http://www.example.com or https://another-example.com/path?query=true.
Call me on (123) 456-7890 or +49 123 45678. My old number was 555-123-4567.
Important dates: 2023-12-25, 01/15/2024, and 31.07.2023. Not a date: 1234.
This is just a regular number 12345 and a price $19.99.
"""
    plain_sample = "No special items here. Just plain text."
    www_sample = "My website is www.test-site.com."

    # (banner, text, whether to announce an empty result)
    demo_cases = (
        ("--- Analyzing Sample Text 1 ---", rich_sample, False),
        ("\n--- Analyzing Sample Text 2 ---", plain_sample, True),
        ("\n--- Analyzing Sample Text 3 (www only) ---", www_sample, False),
    )
    for banner, sample, announce_empty in demo_cases:
        print(banner)
        findings = analyze_text_for_contextual_items(sample)
        if announce_empty and not findings:
            print("No contextual items found.")
        for kind, hits in findings.items():
            print(f"{kind.capitalize()}: {hits}")

    print("\n--- Testing individual finders ---")
    probes = (
        ("Emails", find_emails, "contact support@example.org"),
        # Will not find anything: the URL regex requires a scheme or "www."
        ("URLs", find_urls, "go to example.com"),
        ("URLs", find_urls, "go to www.example.com"),
        ("Phones", find_phone_numbers, "Tel: 1234567890"),
        ("Dates", find_dates, "Meeting on 10/20/2025"),
    )
    for label, finder, probe in probes:
        print(f"{label} in '{probe}': {finder(probe)}")