From 93b64d2dd829fa266ad831129a26eddcd47abae7 Mon Sep 17 00:00:00 2001 From: lorax <142108266+lorax42@users.noreply.github.com> Date: Fri, 21 Feb 2025 09:56:53 +0100 Subject: [PATCH 1/3] fix [issue #305]: Improve teacher fuzzy finding * improve teacher fuzzy finding in additional info * use NLP to analyze the teacher's part of sentence * this minimizes false positives caused by just looking for teacher abbreviations in the DB since these can sometimes be other words --- backend/lesson_info.py | 23 +++++++++++++++++------ requirements.txt | 3 ++- server.py | 5 +++++ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/backend/lesson_info.py b/backend/lesson_info.py index 4ca07ad..a9b5a8e 100644 --- a/backend/lesson_info.py +++ b/backend/lesson_info.py @@ -13,6 +13,7 @@ ) from . import teacher as teacher_model, blocks, typography_fixer from . import models +from server import nlp # import nlp for german (init in server) # Nicht verfügbare Räume: 1302 (1-2,7-10), 1306 (1-2,4,6) @@ -818,9 +819,9 @@ def process_additional_info_line(text: str, parsed_existing_forms: list[ParsedFo # TODO: Dates, Rooms funcs = ( - lambda s: add_fuzzy_teacher_links(s, teachers, date), - lambda s: add_fuzzy_form_links(s, parsed_existing_forms, date), - lambda s: add_fuzzy_room_links(s, rooms, date) + lambda s: add_fuzzy_teacher_links(s, text, teachers, date), + lambda s: add_fuzzy_form_links(s, text, parsed_existing_forms, date), + lambda s: add_fuzzy_room_links(s, text, rooms, date) ) segments = [LessonInfoTextSegment(text)] @@ -866,7 +867,7 @@ def add_fuzzy_with_validator( return segments -def add_fuzzy_form_links(text: str, parsed_existing_forms: list[ParsedForm], date: datetime.date +def add_fuzzy_form_links(text: str, _, parsed_existing_forms: list[ParsedForm], date: datetime.date ) -> list[LessonInfoTextSegment]: def validator(match: re.Match) -> list[LessonInfoTextSegment] | None: parsed_forms = ParsedForm.from_form_match(match) @@ -907,8 +908,18 @@ def validator(match: 
re.Match) -> list[LessonInfoTextSegment] | None: return add_fuzzy_with_validator(text, [_loose_parse_form_pattern], validator) -def add_fuzzy_teacher_links(text: str, teachers: teacher_model.Teachers, date: datetime.date): +def add_fuzzy_teacher_links(text: str, context: str, teachers: teacher_model.Teachers, date: datetime.date): def validator(match: re.Match) -> list[LessonInfoTextSegment] | None: + # check if word can be teacher (by grammar rules) + doc = nlp(context) # analyze info line + # find teacher in info + for i in doc: + if doc[i].text == text: + # check part of speach of suspected teacher + # has to be proper noun to be accepted + if doc[i].pos_ != "PROPN": + return None + surname_or_abbreviation = match.group() replacements = { @@ -938,7 +949,7 @@ def validator(match: re.Match) -> list[LessonInfoTextSegment] | None: return add_fuzzy_with_validator(text, [re.compile(_InfoParsers._teacher), re.compile(r"\b\w+\b")], validator) -def add_fuzzy_room_links(text: str, rooms: set[str], date: datetime.date): +def add_fuzzy_room_links(text: str, _, rooms: set[str], date: datetime.date): def validator(match: re.Match) -> list[LessonInfoTextSegment] | None: room = match.group() return [ diff --git a/requirements.txt b/requirements.txt index 1998b6d..2cf20df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ brotlicffi~=1.1.0.0 pywebpush~=1.14.0 py-vapid~=1.9.0 aiohttp~=3.9.3 -icalendar~=6.0.0a0 \ No newline at end of file +icalendar~=6.0.0a0 +spacy~=3.8.4 diff --git a/server.py b/server.py index c8b282a..810d883 100644 --- a/server.py +++ b/server.py @@ -15,6 +15,8 @@ from endpoints.stats import stats import endpoints.webpush +import spacy # spacy for language analysis + from utils import User, AddStaticFileHashFlask, get_user, send_error, update_database, meta_to_database @@ -43,6 +45,8 @@ app.register_blueprint(api) app.register_blueprint(stats, url_prefix="/stats") +# spacy natural language processor +nlp = None @app.after_request def 
after_request(resp): @@ -98,4 +102,5 @@ def sw() -> Response: if __name__ == "__main__": update_database() + nlp = spacy.load("de_core_news_sm") # load german language model app.run(debug=DEBUG, host="0.0.0.0") From 647b41217855d40267b7f14469cb1fc244952ac1 Mon Sep 17 00:00:00 2001 From: lorax <142108266+lorax42@users.noreply.github.com> Date: Sat, 22 Feb 2025 10:00:40 +0100 Subject: [PATCH 2/3] refactor: move nlp setup from server.py to lesson_info.py --- backend/lesson_info.py | 4 +++- client/package-lock.json | 4 ++-- server.py | 3 --- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/backend/lesson_info.py b/backend/lesson_info.py index a9b5a8e..d1494e8 100644 --- a/backend/lesson_info.py +++ b/backend/lesson_info.py @@ -6,6 +6,7 @@ import logging import re import typing +import spacy # spacy for language analysis from .vplan_utils import ( parse_periods, _parse_form_pattern, ParsedForm, parsed_forms_to_str, forms_to_str, @@ -13,7 +14,8 @@ ) from . import teacher as teacher_model, blocks, typography_fixer from . 
import models -from server import nlp # import nlp for german (init in server) + +nlp = spacy.load("de_core_news_sm") # load german language model # Nicht verfügbare Räume: 1302 (1-2,7-10), 1306 (1-2,4,6) diff --git a/client/package-lock.json b/client/package-lock.json index 73e5686..aca9902 100644 --- a/client/package-lock.json +++ b/client/package-lock.json @@ -1,12 +1,12 @@ { "name": "vplan-fr", - "version": "1.0.0", + "version": "1.1.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "vplan-fr", - "version": "1.0.0", + "version": "1.1.0", "dependencies": { "chart.js": "^4.4.2", "postcss": "^8.4.35", diff --git a/server.py b/server.py index 810d883..ecf1d24 100644 --- a/server.py +++ b/server.py @@ -15,8 +15,6 @@ from endpoints.stats import stats import endpoints.webpush -import spacy # spacy for language analysis - from utils import User, AddStaticFileHashFlask, get_user, send_error, update_database, meta_to_database @@ -102,5 +100,4 @@ def sw() -> Response: if __name__ == "__main__": update_database() - nlp = spacy.load("de_core_news_sm") # load german language model app.run(debug=DEBUG, host="0.0.0.0") From 6713fb217ea858704c906eb712ce6328913e7412 Mon Sep 17 00:00:00 2001 From: lorax <142108266+lorax42@users.noreply.github.com> Date: Fri, 28 Feb 2025 12:21:17 +0100 Subject: [PATCH 3/3] refactor(server): remove unnecessary declaration --- server.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/server.py b/server.py index ecf1d24..8de9c35 100644 --- a/server.py +++ b/server.py @@ -43,9 +43,6 @@ app.register_blueprint(api) app.register_blueprint(stats, url_prefix="/stats") -# spacy natural language processor -nlp = None - @app.after_request def after_request(resp): if request.path.startswith("/public"):