-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
104 lines (84 loc) · 3.21 KB
/
utils.py
File metadata and controls
104 lines (84 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
import cn2an # 将中文大写金额转换为阿拉伯数字
from datetime import datetime
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to tidy up.

    Returns:
        The text with newlines, tabs and repeated blanks reduced to single
        spaces and with no leading or trailing whitespace.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed.strip()
def clean_ocr_text(text):
    """Clean text produced by an OCR scan, line by line.

    Every character outside the allow-list is deleted (this removes mojibake
    such as ``Â``, ``�`` or ``‰``, and also all whitespace inside a line).
    Lines that end up shorter than three characters are dropped as noise.

    The allow-list is: CJK ideographs U+4E00-U+9FA5, ASCII letters and
    digits, the punctuation ``,。!?;()%《》、·—-`` and both straight and curly
    quotation marks.

    Args:
        text: Raw OCR output; may be empty or None.

    Returns:
        The surviving lines joined with ``"\\n"``; ``""`` when the input is
        empty or None.
    """
    if not text:
        return ""

    # The quote characters are kept in a separate, non-raw literal: putting a
    # bare double quote inside the raw pattern string would terminate that
    # string and silently drop the double quote from the allow-list.
    allowed = (
        r"\u4e00-\u9fa5"
        r"a-zA-Z0-9"
        r",。!?;()%《》"
        "\"'“”‘’"
        r"、·—\-"
    )
    # Compiled once, outside the per-line loop.
    noise = re.compile("[^" + allowed + "]")

    cleaned = []
    for line in text.splitlines():
        line = noise.sub("", line.strip())
        # Very short leftovers are almost always OCR debris.
        if len(line) >= 3:
            cleaned.append(line)
    return "\n".join(cleaned)
def preprocess_text_for_nlp(text, max_length=10000):
    """Prepare raw text for an NLP model.

    Characters outside the allow-list (CJK ideographs U+4E00-U+9FA5, ASCII
    letters and digits, the punctuation ``,。!?;():-`` and whitespace) are
    removed, every whitespace run is collapsed to a single space, the result
    is cut to ``max_length`` characters and finally trimmed.

    Args:
        text: The text to preprocess; may be empty or None.
        max_length: Maximum number of characters to keep, so the model is not
            fed overly long input. Pass None to disable truncation.

    Returns:
        The cleaned text; ``""`` when the input is empty or None.
    """
    if not text:
        return ""

    # Remove disallowed characters *before* collapsing whitespace: deleting a
    # symbol that sat between two blanks would otherwise leave a double space.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9,。!?;():\-\s]', '', text)
    text = re.sub(r'\s+', ' ', text)

    if max_length is not None and len(text) > max_length:
        text = text[:max_length]
    return text.strip()
def find_first(text, pattern, group=1):
    """Return one group of the first regex match found in the text.

    Args:
        text: The string to search.
        pattern: Regular expression handed to ``re.search``.
        group: Index or name of the group to return (default: first group).

    Returns:
        The requested group of the first match, or None when the pattern
        does not match anywhere in the text.
    """
    hit = re.search(pattern, text)
    if hit is None:
        return None
    return hit.group(group)
def find_all(text, pattern, group=1):
    """Collect one group from every regex match in the text.

    Args:
        text: The string to scan.
        pattern: Regular expression handed to ``re.finditer``.
        group: Index or name of the group to take from each match
            (default: first group).

    Returns:
        A list with the requested group of each match, in order of
        appearance; empty when nothing matches.
    """
    captured = []
    for hit in re.finditer(pattern, text):
        captured.append(hit.group(group))
    return captured
def normalize_text(text):
    """Bring text into a canonical layout.

    Line endings are unified to ``"\\n"``, runs of spaces and tabs become a
    single space, blank lines are removed and every remaining line is
    stripped of surrounding whitespace.

    Args:
        text: The text to normalise; any falsy value yields ``""``.

    Returns:
        The normalised text with its lines joined by ``"\\n"``.
    """
    if not text:
        return ""

    # Windows (\r\n) and old Mac (\r) line endings -> \n.
    unified = text.replace('\r\n', '\n').replace('\r', '\n')
    # Squeeze horizontal whitespace.
    squeezed = re.sub(r'[ \t]+', ' ', unified)
    # Fold newline / whitespace / newline sequences into one newline.
    compact = re.sub(r'\n\s*\n', '\n', squeezed)

    kept = []
    for raw_line in compact.split('\n'):
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return '\n'.join(kept)
def extract_date(text):
    """Extract the first date found in the text.

    Recognised forms are a four-digit year, a one- or two-digit month and a
    one- or two-digit day separated by ``-``, ``/`` or the Chinese markers
    ``年`` / ``月`` (for example ``2023-05-01``, ``2023/5/1``, ``2023年5月1日``).

    Args:
        text: The text to search; may be empty or None.

    Returns:
        A ``datetime`` at midnight of the matched day, or None when no date
        is found or the first candidate is not a valid calendar date.
    """
    if not text:
        return None

    match = re.search(r"(\d{4}[\-/年]\d{1,2}[\-/月]\d{1,2})", text)
    if match is None:
        return None

    # Map every accepted separator to "-" so one strptime format covers all
    # spellings. "/" must be included here, otherwise slash-separated dates
    # match the regex but can never be parsed. A trailing "日" is not part of
    # the captured group, so it needs no handling.
    normalized = re.sub(r"[/年月]", "-", match.group(1))
    try:
        return datetime.strptime(normalized, "%Y-%m-%d")
    except ValueError:
        # Looks like a date but is not one (e.g. month 13).
        return None
def extract_amount(text):
    """Extract monetary amounts from the text as plain numbers (in yuan).

    Two notations are recognised:

    1. Arabic numerals followed by a unit, e.g. ``120万元``, ``5,000元``,
       ``2亿元``.
    2. Chinese numerals followed by ``元``/``圆``, e.g. ``壹佰贰拾万元整``,
       ``伍仟元``, ``五千元``; these are converted with ``cn2an``.

    Args:
        text: The text to search; may be empty or None.

    Returns:
        A list of floats, Arabic-numeral amounts first and Chinese-numeral
        amounts after them, each group in order of appearance; None when no
        amount is found.
    """
    if not text:
        return None

    # Drop thousands separators so "5,000元" reads as 5000.
    text = text.replace(",", "")
    results = []

    # 1. Arabic-numeral amounts. Longer units are listed first so "万元" is
    #    not mistaken for a bare "元".
    unit_factors = {"亿元": 100000000, "万元": 10000, "元": 1}
    arabic_pattern = r"(\d+(?:\.\d+)?)\s*(亿元|万元|元)"
    for match in re.finditer(arabic_pattern, text):
        num_str, unit = match.groups()
        results.append(float(num_str) * unit_factors[unit])

    # 2. Chinese-numeral amounts. Only the numeral itself is captured and
    #    passed to cn2an; the currency word and a trailing 整/正 are not
    #    numerals. The first character must be a digit (or 十/拾), so the
    #    "万" of an Arabic amount such as "120万元" is not picked up as a
    #    number of its own.
    cn_digits = "零〇一二两三四五六七八九壹贰叁肆伍陆柒捌玖"
    cn_units = "十百千拾佰仟万亿"
    chinese_pattern = (
        "([" + cn_digits + "十拾][" + cn_digits + cn_units + "]*)[元圆]"
    )
    for match in re.finditer(chinese_pattern, text):
        try:
            amount = cn2an.cn2an(match.group(1), "smart")
        except ValueError:
            # Not a well-formed numeral; skip it (best effort).
            continue
        results.append(float(amount))

    return results or None