-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_features.py
More file actions
59 lines (46 loc) · 1.96 KB
/
build_features.py
File metadata and controls
59 lines (46 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm
from build_feature_functions import ner_calc, geo_mean_prob, max_sent_len, cefr_vocab_calc
'''
Prepare and compute the features for the individual snippets.
The results are saved as CSV files.
CSV format:
Columns: ['Titel', 'Snippet_NR', 'Snippet', 'POS','NER', 'word_rarity',
'max_sent_len', 'a1', 'a2', 'b1', 'b2', 'c1','unknown', 'Quelle',
'Label']
'''
# Register tqdm's pandas integration at import time; this is what provides
# the `progress_apply` method used on the 'Snippet' column in add_features.
tqdm.pandas(desc="Processing rows")
def _file_ending(path):
    """Return the part of the file name of ``path`` after its last underscore."""
    # Only the file name is inspected: splitting the full path string would let
    # underscores in parent directories leak path separators into the ending.
    return Path(path).name.split('_')[-1]


def add_features(snippet_folder, feature_folder, overwrite, max_files):
    """Compute features for every snippet CSV and store the enriched CSVs.

    Each file in ``snippet_folder`` is read with ``pandas.read_csv``, extended
    with the feature columns computed from its 'Snippet' column, and written
    to ``feature_folder`` as ``snippets_with_features_<ending>``, where
    ``<ending>`` is the part of the input file name after its last underscore.

    Args:
        snippet_folder: Directory containing the input snippet CSV files.
        feature_folder: Output directory; created (including parents) if it
            does not exist yet.
        overwrite: If true, all regular files currently in ``feature_folder``
            are deleted and every input file is processed. Otherwise input
            files whose ending already occurs in ``feature_folder`` are
            skipped.
        max_files: Upper bound on the number of input files processed in this
            call (applied as a slice, so ``None`` means no limit).

    Raises:
        KeyError: If an input CSV has no 'Snippet' column.
    """
    feature_folder = Path(feature_folder)
    # Race-free replacement for an exists() check followed by makedirs().
    feature_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so that the `max_files` cut-off picks a reproducible subset;
    # directories are ignored because they can never be read as CSV.
    files = sorted(
        path for path in Path(snippet_folder).glob('*') if path.is_file())

    if overwrite:
        for stale in feature_folder.glob('*'):
            # Only regular files are removed; unlink() would fail on a
            # sub-directory.
            if stale.is_file():
                stale.unlink()
    else:
        already_parsed = {
            _file_ending(path) for path in feature_folder.glob('*')}
        files = [
            path for path in files
            if _file_ending(path) not in already_parsed]
    files = files[:max_files]

    ### ACTUAL COMPUTATION OF THE INDIVIDUAL FEATURES ###
    for csv_path in files:
        df_features = pd.read_csv(csv_path)
        file_ending = _file_ending(csv_path)

        # Result of ner_calc is appended column-wise; missing values become 0.
        # NOTE(review): presumably ner_calc returns a Series per snippet (the
        # 'POS'/'NER' columns) - confirm against build_feature_functions.
        spacy_columns = df_features['Snippet'].progress_apply(ner_calc)
        df_features = pd.concat(
            [df_features, spacy_columns.fillna(0)], axis=1)

        df_features['word_rarity'] = df_features['Snippet'].progress_apply(
            geo_mean_prob)
        df_features['max_sent_len'] = df_features['Snippet'].progress_apply(
            max_sent_len)

        # CEFR vocabulary: expand each per-snippet result into six columns.
        df_features[['a1', 'a2', 'b1', 'b2', 'c1', 'unknown']] = \
            df_features['Snippet'].progress_apply(
                cefr_vocab_calc).progress_apply(pd.Series)

        df_features.to_csv(
            feature_folder / f'snippets_with_features_{file_ending}',
            index=True)