Skip to content

Commit 73db99d

Browse files
authored
✨ feat: 건축물대장, 등기부등본 추가 출력
✨ feat: 건축물대장, 등기부등본 추가 출력
2 parents a218137 + edcfca0 commit 73db99d

2 files changed

Lines changed: 105 additions & 47 deletions

File tree

extractors/building_parser.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def extract_text_based_pdf(self, doc, last_word: str) -> Optional[Dict]:
193193

194194
approval_date = self.find_approval_date_improved(all_pages_text)
195195
violation_status = self.find_violation_status(all_pages_text)
196+
issue_date = self.find_issue_date(all_pages_text)
196197

197198
return {
198199
"대지위치": basic_info.get("대지위치", ""),
@@ -203,7 +204,8 @@ def extract_text_based_pdf(self, doc, last_word: str) -> Optional[Dict]:
203204
"층수": basic_info.get("층수", None),
204205
"용도": basic_info.get("용도", []),
205206
"사용승인일": approval_date,
206-
"위반건축물여부": violation_status
207+
"위반건축물여부": violation_status,
208+
"발급일": issue_date # 발급일 추가
207209
}
208210

209211
def extract_image_based_pdf_improved(self, doc, last_word: str) -> Optional[Dict]:
@@ -258,6 +260,7 @@ def extract_image_based_pdf_improved(self, doc, last_word: str) -> Optional[Dict
258260

259261
cropped_text = self.reconstruct_text_from_ocr(cropped_words)
260262
basic_info = self.extract_basic_info_from_text_with_land_area(cropped_text)
263+
261264

262265
# 개선된 사용승인일 검색
263266
print("전체 PDF에서 사용승인일 검색 중...")
@@ -311,6 +314,7 @@ def extract_image_based_pdf_improved(self, doc, last_word: str) -> Optional[Dict
311314
# 개선된 사용승인일 검색 사용
312315
approval_date = self.find_approval_date_improved(all_pages_text, ocr_words_by_page)
313316
violation_status = self.find_violation_status(all_pages_text)
317+
issue_date = self.find_issue_date(all_pages_text)
314318

315319
return {
316320
"대지위치": basic_info.get("대지위치", ""),
@@ -321,7 +325,8 @@ def extract_image_based_pdf_improved(self, doc, last_word: str) -> Optional[Dict
321325
"층수": basic_info.get("층수", None),
322326
"용도": basic_info.get("용도", []),
323327
"사용승인일": approval_date,
324-
"위반건축물여부": violation_status
328+
"위반건축물여부": violation_status,
329+
"발급일": issue_date # 발급일 추가
325330
}
326331

327332
def extract_basic_info_from_text_with_land_area(self, text):
@@ -788,6 +793,35 @@ def parse_area_block(self, lines: List[str]) -> Dict[str, float]:
788793
print(f"[parse_area_block] 매핑 결과: {result}")
789794
return result
790795

796+
def find_issue_date(self, full_text):
    """Find the document issue date (the value following the "발급일" label).

    The label may contain whitespace between its characters and may be
    followed by an optional colon. Four date notations are recognised, tried
    in this order: "YYYY년 M월 D일", "YYYY.M.D", "YYYY-M-D" and "YYYY/M/D".

    Args:
        full_text: Text of the whole document. ``None`` or an empty string
            is treated as "no date present".

    Returns:
        The first plausible issue date formatted as "YYYY.MM.DD", or an
        empty string when no valid date is found.
    """
    # Imported locally so the method does not depend on the module's
    # top-level import block.
    from datetime import date

    if not full_text:
        return ""

    # Issue-date patterns, one per supported notation.
    issue_date_patterns = [
        r'(?:발\s*급\s*일)\s*:?\s*(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일',
        r'(?:발\s*급\s*일)\s*:?\s*(\d{4})\.\s*(\d{1,2})\.\s*(\d{1,2})',
        r'(?:발\s*급\s*일)\s*:?\s*(\d{4})-(\d{1,2})-(\d{1,2})',
        r'(?:발\s*급\s*일)\s*:?\s*(\d{4})/(\d{1,2})/(\d{1,2})',
    ]

    for pattern in issue_date_patterns:
        # Walk every occurrence: a rejected first hit (e.g. an OCR-garbled
        # year) must not hide a later valid one in the same notation.
        for match in re.finditer(pattern, full_text):
            year, month, day = (int(group) for group in match.groups())

            # NOTE(review): the 2020-2050 window is kept from the original
            # code; presumably issue dates are expected to be recent --
            # confirm before widening.
            if not 2020 <= year <= 2050:
                continue

            try:
                # Rejects impossible calendar dates such as 02/31.
                date(year, month, day)
            except ValueError:
                continue

            result_date = f"{year}.{month:02d}.{day:02d}"
            print(f"발급일 발견: {result_date}")
            return result_date

    return ""
824+
791825

792826
def reconstruct_text_from_ocr(self, cropped_words) -> str:
793827
"""OCR 결과를 줄별로 재구성하여 텍스트 형태로 변환"""
@@ -1085,6 +1119,8 @@ def save_to_json(self, result: Dict, pdf_path: str, output_dir: str = "../data/o
10851119
json_result[key] = "정보없음"
10861120
elif key == "용도" and isinstance(value, list):
10871121
json_result[key] = value if value else ["정보없음"]
1122+
elif key in ["사용승인일", "발급일"] and (value == "" or value is None): # 발급일 처리 추가
1123+
json_result[key] = "정보없음"
10881124
else:
10891125
json_result[key] = value if value != "" else "정보없음"
10901126

@@ -1101,7 +1137,7 @@ def main():
11011137
parser.add_argument('--last-word', '-l', default='m',
11021138
help='크롭 범위의 끝 단어 (기본값: m)')
11031139
parser.add_argument(
1104-
'--output-dir', '-o', default='../data/output/building_json', help='결과 저장 디렉토리')
1140+
'--output-dir', '-o', default='C:/LLM/data/output/building_json', help='결과 저장 디렉토리')
11051141
parser.add_argument(
11061142
'--debug', '-d', action='store_true', help='디버깅 모드 활성화')
11071143

extractors/register_parser.py

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,7 @@ def extract_text_based_pdf(file_path, result):
432432
match = re.search(r"발행일\s*(\d{4}/\d{2}/\d{2})", page_text)
433433
if match:
434434
result["발행일"] = match.group(1).strip()
435+
print(f"발행일 발견: {result['발행일']}")
435436

436437
tables = page.find_tables(
437438
table_settings={
@@ -489,6 +490,8 @@ def extract_text_based_pdf(file_path, result):
489490
for j, cell in enumerate(row):
490491
if cell and str(cell).strip():
491492
clean_cell = str(cell).replace("\n", " ").strip()
493+
# 셀 텍스트에서 구조 관련 띄어쓰기 미리 정리
494+
clean_cell = re.sub(r'구\s+조', '구조', clean_cell)
492495
header = header_row[j] if j < len(header_row) else f"컬럼{j}"
493496

494497
# 순위번호 찾기
@@ -613,16 +616,14 @@ def check_legal_status(text_content, legal_status):
613616

614617

615618
def extract_title_section_info(table_data, title_data):
    """Extract title-section (표제부) information into ``title_data``.

    Delegates to ``_find_location_info`` and then ``_find_jeonyu_info``,
    passing both the same table rows and the same dictionary, which they
    fill in place.

    Args:
        table_data: Table rows handed unchanged to both helpers.
        title_data: Dictionary that receives the extracted values.
    """
    _find_location_info(table_data, title_data)
    _find_jeonyu_info(table_data, title_data)
619622

620623

621624
def _find_location_info(table_data, extracted_data):
622625
"""소재지번_건물명칭 정보 추출"""
623-
if extracted_data["소재지번_건물명칭"]:
624-
return
625-
626+
626627
for row in table_data:
627628
if not row:
628629
continue
@@ -631,7 +632,10 @@ def _find_location_info(table_data, extracted_data):
631632
continue
632633

633634
cell_text = str(cell).strip()
634-
if any(keyword in cell_text for keyword in ["소재지번", "건물명칭", "도로명주소"]):
635+
636+
# 소재지번_건물명칭 추출 (아직 없는 경우만)
637+
if (not extracted_data["소재지번_건물명칭"] and
638+
any(keyword in cell_text for keyword in ["소재지번", "건물명칭", "도로명주소"])):
635639
for row_cell in row:
636640
if row_cell and str(row_cell).strip():
637641
cell_content = str(row_cell).strip()
@@ -641,8 +645,45 @@ def _find_location_info(table_data, extracted_data):
641645
]):
642646
extracted_data["소재지번_건물명칭"] = cell_content.replace(
643647
"\n", " ").replace(" ", " ").strip()
644-
return
645-
648+
break
649+
650+
# 헤더와 데이터 매핑 방식으로 건물내역 추출
651+
if not extracted_data.get("건물내역"):
652+
for i, row in enumerate(table_data):
653+
if not row:
654+
continue
655+
656+
row_text = " ".join([str(cell) for cell in row if cell])
657+
658+
# 헤더 행 찾기
659+
if "건물내역" in row_text or "건 물 내 역" in row_text:
660+
# 다음 행에서 데이터 찾기
661+
if i + 1 < len(table_data):
662+
data_row = table_data[i + 1]
663+
if data_row:
664+
# 헤더의 "건물내역" 컬럼 위치 찾기
665+
header_index = -1
666+
for j, header_cell in enumerate(row):
667+
if header_cell and ("건물내역" in str(header_cell) or "건 물 내 역" in str(header_cell)):
668+
header_index = j
669+
break
670+
671+
# 해당 위치의 데이터 추출
672+
if header_index >= 0 and header_index < len(data_row):
673+
data_cell = data_row[header_index]
674+
if data_cell and "구조" in str(data_cell):
675+
cell_content = str(data_cell).strip()
676+
677+
# "구조"까지만 추출
678+
if "구조" in cell_content:
679+
structure_end = cell_content.find("구조") + 2 # "구조" 길이 2
680+
structure = cell_content[:structure_end].strip()
681+
682+
if structure and len(structure) > 3:
683+
extracted_data["건물내역"] = structure
684+
print(f"1동 건물표시에서 구조 추출: '{structure}'")
685+
break
686+
646687

647688
def _find_jeonyu_info(table_data, extracted_data):
648689
"""전유부분의 건물번호와 건물내역 정보 추출"""
@@ -711,42 +752,23 @@ def _analyze_row_for_building_info(row):
711752
building_info["building_number"] = cell_text
712753
break
713754

714-
# 건물내역 패턴 검사
755+
# 건물내역 패턴 검사 - 구조 정보만 추출
715756
building_detail_patterns = [
716-
r'.*구\s*조.*\d+\.?\d*\s*m2',
717-
r'.*구\s*조.*\d+\.?\d*\s*㎡',
718-
r'.*구조.*\d+\.?\d*\s*m2',
719-
r'.*구조.*\d+\.?\d*\s*㎡',
757+
r'.*구\s*조.*', # 면적 부분 제거, 구조만
758+
r'.*구조.*', # 면적 부분 제거, 구조만
720759
]
721760
for pattern in building_detail_patterns:
722761
if re.search(pattern, cell_text) and not building_info["building_detail"]:
723-
building_info["building_detail"] = cell_text
724-
break
725-
726-
# 면적 정보가 있는 경우 구조 정보와 결합
727-
area_pattern = r'\d+\.?\d*\s*(m2|㎡)'
728-
if re.search(area_pattern, cell_text) and not building_info["building_detail"]:
729-
structure_info = _find_structure_in_same_row(row, cell_idx)
730-
if structure_info:
731-
building_info["building_detail"] = f"{structure_info} {cell_text}"
762+
# 면적 부분 제거하고 구조만 남기기
763+
structure_only = re.sub(r'\s*\d+\.?\d*\s*(?:m2|㎡)', '', cell_text).strip()
764+
# "구 조"를 "구조"로 변경
765+
structure_only = structure_only.replace('구 조', '구조')
766+
if structure_only: # 빈 문자열이 아닌 경우만
767+
building_info["building_detail"] = structure_only
768+
break
732769

733770
return building_info
734771

735-
736-
def _find_structure_in_same_row(row, exclude_cell_idx):
737-
"""같은 행에서 구조 정보 찾기"""
738-
structure_keywords = ["콘크리트", "구조", "철골", "철근", "목구조", "벽돌", "블록"]
739-
for cell_idx, cell in enumerate(row):
740-
if cell_idx == exclude_cell_idx or not cell:
741-
continue
742-
743-
cell_text = str(cell).strip()
744-
if any(keyword in cell_text for keyword in structure_keywords):
745-
if not re.search(r'\d+\.?\d*\s*(m2|㎡)', cell_text):
746-
return cell_text
747-
return None
748-
749-
750772
def _extract_latest_owner_info(gabgu_data) -> Dict[str, Any]:
751773
"""갑구 데이터에서 최신 소유자 정보 추출"""
752774
latest_owner_info = {
@@ -891,11 +913,19 @@ def _prepare_risk_analysis_data(result, owner_info, mortgage_info) -> Dict[str,
891913
region_address = parts[0].strip() # 도로명주소 앞부분만
892914
road_address = parts[1].strip() # 도로명주소 부분만
893915

916+
# 건물내역에서 띄어쓰기 수정
917+
building_detail = result["표제부"].get("건물내역") or ""
918+
if building_detail:
919+
building_detail = building_detail.replace('구 조', '구조')
920+
894921
risk_data = {
895922
"region_address": region_address,
896923
"road_address": road_address,
924+
"building_number": result["표제부"].get("건물번호") or "", # 건물번호 추가
925+
"building_detail": building_detail, # 건물내역 추가
897926
"owner_name": owner_info.get("소유자명") or "",
898927
"owner_birth_date": _parse_birth_date_from_id(owner_info.get("주민번호")),
928+
"issue_date": result.get("발행일") or "", # 발행일 추가
899929
"has_seizure": result["법적상태"]["가압류_여부"],
900930
"has_auction": result["법적상태"]["경매_여부"],
901931
"has_litigation": result["법적상태"]["소송_여부"],
@@ -972,14 +1002,6 @@ def _extract_amount_from_korean(amount_str: str) -> Optional[int]:
9721002
return None
9731003

9741004

975-
def extract_title_info(lines):
    """Legacy helper: no longer used, kept for backward compatibility.

    Scans ``lines`` for entries containing "표제부" and records the last such
    line under the "표제부 라벨" key.

    Args:
        lines: Iterable of text lines. ``None`` is treated as empty, and
            non-string entries are skipped.

    Returns:
        A dict that is empty when no line matched, otherwise
        ``{"표제부 라벨": <last matching line>}``.
    """
    title_info = {}
    for line in lines or ():
        # Non-string entries (e.g. None from a failed extraction) would
        # raise TypeError on the membership test, so they are skipped.
        if isinstance(line, str) and "표제부" in line:
            title_info["표제부 라벨"] = line
    return title_info
982-
9831005

9841006
def save_json(output_dict, output_path):
9851007
"""JSON 파일로 저장"""

0 commit comments

Comments
 (0)