"""preprocess.py -- text normalization, markup stripping, and sentence/paragraph splitting."""
import re
import unicodedata
from dataclasses import dataclass
from typing import List
# Typographic quotes and dashes folded to their ASCII counterparts so the
# patterns below only have to deal with one spelling of each character.
_QUOTE_TRANSLATION = str.maketrans(
    {
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark / apostrophe
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2212": "-",  # minus sign
    }
)

# Zero-width space, non-joiner, joiner, and the BOM / zero-width no-break space.
_ZERO_WIDTH = re.compile(r"[\u200b\u200c\u200d\ufeff]")

# Runs of horizontal whitespace only; newlines are handled separately.
_WHITESPACE = re.compile(r"[ \t]+")

# Three or more consecutive newlines (i.e. more than one blank line).
_MULTI_NEWLINE = re.compile(r"\n{3,}")

# "http(s)://..." or bare "www...." up to the next whitespace character.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)

# Integers and numbers with any number of "." / "," separated digit groups
# ("3.14", "1,234,567", "2024.01.15"). Allowing repeated groups keeps a
# multi-group number from leaving stray separators behind when it is removed.
_NUMBER_PATTERN = re.compile(r"\b\d+(?:[.,]\d+)*\b")

# Any run of characters Markdown uses as markup punctuation.
_MARKDOWN_SYNTAX = re.compile(r"[*_`~>#\[\]\(\)\-]{1,}")

# Fenced blocks (```...```, may span lines) or single-line inline `code`.
_CODE_BLOCK = re.compile(r"```.*?```|`[^`\n]+`", re.DOTALL)

# A whole line whose first non-blank character is ">".
# Only spaces/tabs are allowed around the marker: "\s" would also match
# newlines, letting the pattern eat preceding blank lines or -- for a bare
# ">" line -- the entire following line.
_BLOCKQUOTE_LINE = re.compile(r"^[ \t]*>[ \t]?.*$", re.MULTILINE)

# An ATX heading line: up to three leading blanks, one to six "#", then
# either end of line (empty heading) or blanks followed by the title.
# As above, "\s" is avoided so a bare "##" cannot consume the next line and
# leading blank lines are never swallowed. "#hashtag" is still not a heading.
_HEADING_LINE = re.compile(r"^[ \t]{0,3}#{1,6}(?:[ \t]+.*)?$", re.MULTILINE)

# Split point: whitespace that follows ".", "!" or "?" and precedes an
# uppercase ASCII letter, a digit, or a quote character.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9\"'])")
@dataclass
class PreprocessOptions:
    """Switches selecting which kinds of content preprocessing discards.

    Every switch is off by default, so a default-constructed instance asks
    for normalization only.
    """

    # Replace blockquote lines (matched by _BLOCKQUOTE_LINE) with a space.
    ignore_blockquotes: bool = False
    # Replace fenced and inline code (matched by _CODE_BLOCK) with a space.
    ignore_code_blocks: bool = False
    # Replace URLs (matched by _URL_PATTERN) with a space.
    ignore_urls: bool = False
    # Replace numbers (matched by _NUMBER_PATTERN) with a space.
    ignore_numbers: bool = False
    # Replace heading lines (matched by _HEADING_LINE) with a space.
    strip_headings: bool = False
    # Remove Markdown punctuation before sentence/paragraph/word extraction.
    strip_markdown: bool = False
@dataclass
class ProcessedText:
    """Bundle of the text variants and counts produced by preprocess_text."""

    # The text exactly as it was handed to preprocess_text.
    raw_input: str
    # The input after normalization, before any optional stripping.
    normalized_text: str
    # The normalized text after the optional stripping passes; Markdown
    # punctuation is still present in this variant.
    style_text: str
    # Sentences extracted from the clean (optionally Markdown-free) text.
    sentences: List[str]
    # Paragraphs extracted from the clean (optionally Markdown-free) text.
    paragraphs: List[str]
    # Number of words found in the clean text.
    word_count: int
    # Number of non-blank lines in style_text.
    line_count: int
def _normalize_common(text: str) -> str:
    """Return *text* in a canonical form for downstream splitting and counting.

    Steps, in order: Unicode NFKC normalization; typographic quotes and
    dashes folded to ASCII; zero-width characters removed; CRLF / CR line
    endings turned into LF; runs of spaces and tabs collapsed to one space;
    trailing spaces removed from every line; runs of three or more newlines
    collapsed to exactly two; leading and trailing whitespace stripped.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.translate(_QUOTE_TRANSLATION)
    text = _ZERO_WIDTH.sub("", text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = _WHITESPACE.sub(" ", text)
    # A line holding only a space (e.g. what is left after a heading or
    # blockquote has been replaced by " ") must count as a blank line;
    # otherwise the newline collapsing below and "\n\n"-based paragraph
    # splitting do not see it as one. Tabs are already spaces at this point.
    text = "\n".join(line.rstrip(" ") for line in text.split("\n"))
    text = _MULTI_NEWLINE.sub("\n\n", text)
    return text.strip()
def _apply_stripping(text: str, options: PreprocessOptions) -> str:
    """Blank out every content class that *options* asks to ignore.

    Each enabled pass replaces its matches with a single space. Passes run
    in a fixed order: code blocks, blockquote lines, heading lines, URLs,
    numbers.

    Args:
        text: The text to filter.
        options: Flags choosing which passes run.

    Returns:
        The filtered text; identical to *text* when no flag is set.
    """
    # (enabled?, pattern) pairs, listed in the order they must be applied.
    passes = (
        (options.ignore_code_blocks, _CODE_BLOCK),
        (options.ignore_blockquotes, _BLOCKQUOTE_LINE),
        (options.strip_headings, _HEADING_LINE),
        (options.ignore_urls, _URL_PATTERN),
        (options.ignore_numbers, _NUMBER_PATTERN),
    )
    result = text
    for enabled, pattern in passes:
        if enabled:
            result = pattern.sub(" ", result)
    return result
def split_sentences(text: str) -> List[str]:
    """Break *text* into sentences at the boundaries found by _SENTENCE_SPLIT.

    Args:
        text: The text to split.

    Returns:
        The sentences with surrounding whitespace removed; pieces that are
        empty after trimming are dropped. Falsy input gives an empty list.
    """
    if not text:
        return []
    trimmed = (chunk.strip() for chunk in _SENTENCE_SPLIT.split(text))
    return [sentence for sentence in trimmed if sentence]
def split_paragraphs(text: str) -> List[str]:
    """Break *text* into paragraphs separated by a blank line ("\\n\\n").

    Args:
        text: The text to split.

    Returns:
        The paragraphs with surrounding whitespace removed; pieces that are
        empty after trimming are dropped. Falsy input gives an empty list.
    """
    if not text:
        return []
    paragraphs: List[str] = []
    for block in text.split("\n\n"):
        trimmed = block.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs
def preprocess_text(text: str, options: PreprocessOptions) -> ProcessedText:
    """Normalize *text*, apply the optional stripping passes, and derive metrics.

    Args:
        text: Raw input; stored unchanged on the result as ``raw_input``.
        options: Flags selecting which content classes are dropped and
            whether Markdown punctuation is removed before splitting and
            counting.

    Returns:
        A ``ProcessedText`` in which ``normalized_text`` is the normalized
        input before any stripping, ``style_text`` is the stripped text with
        Markdown punctuation still present, and ``sentences``,
        ``paragraphs`` and ``word_count`` are computed from the clean text
        (``style_text`` with Markdown punctuation removed when
        ``options.strip_markdown`` is set). ``line_count`` is the number of
        non-blank lines in ``style_text``.
    """
    normalized = _normalize_common(text)
    # Stripping substitutes spaces for the removed spans, so normalize once
    # more to tidy up the whitespace it leaves behind.
    style_text = _normalize_common(_apply_stripping(normalized, options))

    clean_text = style_text
    if options.strip_markdown:
        clean_text = _normalize_common(_MARKDOWN_SYNTAX.sub(" ", clean_text))

    # Count apostrophe-joined forms ("don't", "writer's") as one word rather
    # than two. Curly apostrophes were already folded to "'" by
    # _normalize_common, so only the ASCII apostrophe needs handling here.
    word_count = len(re.findall(r"\w+(?:'\w+)*", clean_text))
    line_count = sum(1 for line in style_text.split("\n") if line.strip())

    return ProcessedText(
        raw_input=text,
        normalized_text=normalized,
        style_text=style_text,
        sentences=split_sentences(clean_text),
        paragraphs=split_paragraphs(clean_text),
        word_count=word_count,
        line_count=line_count,
    )