test_code/nlp_processor.py at main · 9dongb/test_code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
코드 ID: NLP_PROCESSOR_001
연결 요구사항: AI-REQ-F-001 (문서 분석 및 요구사항 자동 추출)
작성자: AI System
작성일: 2025-09-01
설명: 자연어 처리를 위한 텍스트 전처리 및 분석 모듈
"""

import logging
import re
from collections import Counter
from typing import Any, Dict, List, Tuple

import spacy

# 코드 ID: NLP_PROCESSOR_001
class NLPProcessor:
    """
    Text preprocessing and analysis helper built on a spaCy pipeline.

    Provides text cleanup, sentence splitting, named-entity extraction,
    keyword-frequency extraction and a keyword/pattern based scorer that
    picks sentences likely to describe requirements.
    Linked requirement ID: AI-REQ-F-001.
    """

    # spaCy pipeline names: the Korean model is tried first, English is the fallback.
    _PRIMARY_MODEL = "ko_core_news_sm"
    _FALLBACK_MODEL = "en_core_web_sm"

    # extract_sentences keeps only sentences whose stripped text is longer than this.
    _MIN_SENTENCE_CHARS = 10

    # identify_requirement_candidates: word-count range that earns a bonus point,
    # and the minimum total score a sentence needs to be returned.
    _MIN_WORDS = 5
    _MAX_WORDS = 30
    _SCORE_THRESHOLD = 2

    # Compiled once at class creation instead of on every call / loop iteration.
    # `\w` already matches Hangul syllables for str patterns, so no explicit
    # Korean range is needed in the character class.
    _NON_WORD_RE = re.compile(r'[^\w\s]')
    _WHITESPACE_RE = re.compile(r'\s+')
    # Used with .search(), so no leading `.*` is required.
    _CAPABILITY_RE = re.compile(r'는.*할\s+수\s+있다')
    _SYSTEM_PROVIDES_RE = re.compile(r'시스템.*제공')

    def __init__(self):
        """
        Load the spaCy pipeline and set up the requirement keyword list.

        The Korean model is loaded first; if that raises ``OSError`` the
        English model is loaded instead and a warning is logged. An
        ``OSError`` raised while loading the fallback model propagates to
        the caller.
        """
        # Create the logger first so the model fallback can be reported.
        self.logger = logging.getLogger(__name__)

        try:
            self.nlp = spacy.load(self._PRIMARY_MODEL)
        except OSError:
            self.logger.warning(
                "Could not load spaCy model %s; falling back to %s",
                self._PRIMARY_MODEL,
                self._FALLBACK_MODEL,
            )
            self.nlp = spacy.load(self._FALLBACK_MODEL)

        # Substrings whose presence in a sentence each add one point to its
        # requirement-candidate score.
        self.requirement_keywords = [
            '요구사항', '필요', '해야', '할 수 있다', '기능', '시스템',
            '사용자', '관리자', '처리', '제공', '지원', '구현'
        ]

    def preprocess_text(self, text: str) -> str:
        """
        Clean up raw text.

        Every character that is neither a word character nor whitespace is
        replaced by a space, runs of whitespace are collapsed to a single
        space, and leading/trailing whitespace is removed.

        Args:
            text (str): Raw text.

        Returns:
            str: Cleaned text; an empty string for empty or ``None`` input.
        """
        if not text:
            return ""

        without_symbols = self._NON_WORD_RE.sub(' ', text)
        collapsed = self._WHITESPACE_RE.sub(' ', without_symbols)
        return collapsed.strip()

    def extract_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences using the loaded spaCy pipeline.

        Args:
            text (str): Input text.

        Returns:
            List[str]: Stripped sentences longer than 10 characters; an
            empty list for empty, whitespace-only or ``None`` input.
        """
        # Nothing to split; skip running the pipeline.
        if not text or not text.strip():
            return []

        doc = self.nlp(text)
        stripped = (sent.text.strip() for sent in doc.sents)
        return [s for s in stripped if len(s) > self._MIN_SENTENCE_CHARS]

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """
        Run named-entity recognition on the text.

        Args:
            text (str): Input text.

        Returns:
            List[Dict]: One dict per entity with keys ``text``, ``label``,
            ``start`` and ``end`` (character offsets reported by spaCy);
            an empty list for empty, whitespace-only or ``None`` input.
        """
        if not text or not text.strip():
            return []

        doc = self.nlp(text)
        return [
            {
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
            }
            for ent in doc.ents
        ]

    def identify_requirement_candidates(self, sentences: List[str]) -> List[Dict[str, Any]]:
        """
        Score sentences and return those likely to state a requirement.

        Scoring per sentence:
          * +1 for each entry of ``self.requirement_keywords`` it contains
          * +1 if it has between 5 and 30 whitespace-separated words
          * +2 if it matches the "...는 ... 할 수 있다" pattern
          * +2 if it matches the "시스템 ... 제공" pattern
        Sentences scoring at least 2 are returned.

        Args:
            sentences (List[str]): Sentences to evaluate.

        Returns:
            List[Dict]: Dicts with keys ``sentence``, ``score``, ``index``
            (position in the input list) and ``length`` (word count),
            sorted by score in descending order; ties keep input order.
        """
        candidates = []

        for index, sentence in enumerate(sentences or []):
            # One point per keyword that occurs as a substring.
            score = sum(1 for keyword in self.requirement_keywords if keyword in sentence)

            # Sentences of moderate length earn a bonus point; very short or
            # very long ones simply do not (no points are subtracted).
            word_count = len(sentence.split())
            if self._MIN_WORDS <= word_count <= self._MAX_WORDS:
                score += 1

            # Phrase patterns typical of requirement statements.
            if self._CAPABILITY_RE.search(sentence):
                score += 2
            if self._SYSTEM_PROVIDES_RE.search(sentence):
                score += 2

            if score >= self._SCORE_THRESHOLD:
                candidates.append({
                    'sentence': sentence,
                    'score': score,
                    'index': index,
                    'length': word_count,
                })

        # list.sort is stable, so equal scores stay in input order.
        candidates.sort(key=lambda item: item['score'], reverse=True)
        return candidates

    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, int]]:
        """
        Extract the most frequent noun keywords.

        Counts tokens tagged ``NOUN`` or ``PROPN`` that are longer than one
        character and are not stop words.

        Args:
            text (str): Input text.
            top_n (int): Number of top keywords to return.

        Returns:
            List[Tuple[str, int]]: ``(keyword, frequency)`` tuples, most
            frequent first; an empty list for empty, whitespace-only or
            ``None`` input.
        """
        if not text or not text.strip():
            return []

        doc = self.nlp(text)
        noun_counts = Counter(
            token.text
            for token in doc
            if token.pos_ in ('NOUN', 'PROPN')
            and len(token.text) > 1
            and not token.is_stop
        )
        return noun_counts.most_common(top_n)