-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathclean_data.py
More file actions
54 lines (48 loc) · 1.5 KB
/
clean_data.py
File metadata and controls
54 lines (48 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
#import pkuseg
#from pyhanlp import *
#HanLP.Config.ShowTermNature = False
#import pandas as pd
def readFile(path):
str_doc = ""
f2 = open('D:/code/bp/visulization/files/new_outtext.txt','w',encoding = "UTF-8")
#f3 = open('D:/code/bp/visulization/files/new_outtext无空格.txt','w',encoding = "UTF-8")
with open(path,'r',encoding='utf-8') as f:
#str_doc = f.read()
lines = f.readlines()
'''
#去除空行
str_doc = re.sub(r"\n+", "\n",out_str)
'''
a = ''
for line in lines:
a += line.strip()#去除换行
c = a.split()#将a分割成每个字符串
b = ''.join(c)#将c的每个字符不以任何符号直接连接
f2.write(b.replace(r"。",'。\n'))
#f2.write(b)
#分词
'''
seg = pkuseg.pkuseg(model_name='default') # 程序会自动下载所对应的细领域模型
for word in str3:
text = seg.cut(word) # 进行分词
print(text)
'''
'''
for word in str3:
text = HanLP.segment(word)
print(text)
'''
'''
data ={
'Description':str_out_list[::-1],
'Width':str4[::-1]
}
# list转dataframe
df = pd.DataFrame(data)
# 保存到本地excel
df.to_excel(r"D:\code\bp\pmt\dem\infor_extract\outfiles\outdes.xlsx", index=True)
'''
if __name__ == '__main__':
path= r'D:\code\bp\visulization\files\outtext.txt'
readFile(path)