@@ -432,6 +432,7 @@ def extract_text_based_pdf(file_path, result):
432432 match = re .search (r"발행일\s*(\d{4}/\d{2}/\d{2})" , page_text )
433433 if match :
434434 result ["발행일" ] = match .group (1 ).strip ()
435+ print (f"발행일 발견: { result ['발행일' ]} " )
435436
436437 tables = page .find_tables (
437438 table_settings = {
@@ -489,6 +490,8 @@ def extract_text_based_pdf(file_path, result):
489490 for j , cell in enumerate (row ):
490491 if cell and str (cell ).strip ():
491492 clean_cell = str (cell ).replace ("\n " , " " ).strip ()
493+ # 셀 텍스트에서 구조 관련 띄어쓰기 미리 정리
494+ clean_cell = re .sub (r'구\s+조' , '구조' , clean_cell )
492495 header = header_row [j ] if j < len (header_row ) else f"컬럼{ j } "
493496
494497 # 순위번호 찾기
@@ -613,16 +616,14 @@ def check_legal_status(text_content, legal_status):
613616
614617
def extract_title_section_info(table_data, title_data):
    """Extract title-section (표제부) information and store it in ``title_data``.

    Runs the two title-section extractors in a fixed order, first
    ``_find_location_info`` and then ``_find_jeonyu_info``, handing both the
    same ``title_data`` object so that whatever they find is written in place.

    Args:
        table_data: Table rows, forwarded unchanged to both helpers.
        title_data: Dictionary that receives the extracted values; forwarded
            unchanged to both helpers.

    Returns:
        None. Results are delivered through ``title_data``.
    """
    _find_location_info(table_data, title_data)
    _find_jeonyu_info(table_data, title_data)
619622
620623
621624def _find_location_info (table_data , extracted_data ):
622625 """소재지번_건물명칭 정보 추출"""
623- if extracted_data ["소재지번_건물명칭" ]:
624- return
625-
626+
626627 for row in table_data :
627628 if not row :
628629 continue
@@ -631,7 +632,10 @@ def _find_location_info(table_data, extracted_data):
631632 continue
632633
633634 cell_text = str (cell ).strip ()
634- if any (keyword in cell_text for keyword in ["소재지번" , "건물명칭" , "도로명주소" ]):
635+
636+ # 소재지번_건물명칭 추출 (아직 없는 경우만)
637+ if (not extracted_data ["소재지번_건물명칭" ] and
638+ any (keyword in cell_text for keyword in ["소재지번" , "건물명칭" , "도로명주소" ])):
635639 for row_cell in row :
636640 if row_cell and str (row_cell ).strip ():
637641 cell_content = str (row_cell ).strip ()
@@ -641,8 +645,45 @@ def _find_location_info(table_data, extracted_data):
641645 ]):
642646 extracted_data ["소재지번_건물명칭" ] = cell_content .replace (
643647 "\n " , " " ).replace (" " , " " ).strip ()
644- return
645-
648+ break
649+
650+ # 헤더와 데이터 매핑 방식으로 건물내역 추출
651+ if not extracted_data .get ("건물내역" ):
652+ for i , row in enumerate (table_data ):
653+ if not row :
654+ continue
655+
656+ row_text = " " .join ([str (cell ) for cell in row if cell ])
657+
658+ # 헤더 행 찾기
659+ if "건물내역" in row_text or "건 물 내 역" in row_text :
660+ # 다음 행에서 데이터 찾기
661+ if i + 1 < len (table_data ):
662+ data_row = table_data [i + 1 ]
663+ if data_row :
664+ # 헤더의 "건물내역" 컬럼 위치 찾기
665+ header_index = - 1
666+ for j , header_cell in enumerate (row ):
667+ if header_cell and ("건물내역" in str (header_cell ) or "건 물 내 역" in str (header_cell )):
668+ header_index = j
669+ break
670+
671+ # 해당 위치의 데이터 추출
672+ if header_index >= 0 and header_index < len (data_row ):
673+ data_cell = data_row [header_index ]
674+ if data_cell and "구조" in str (data_cell ):
675+ cell_content = str (data_cell ).strip ()
676+
677+ # "구조"까지만 추출
678+ if "구조" in cell_content :
679+ structure_end = cell_content .find ("구조" ) + 2 # "구조" 길이 2
680+ structure = cell_content [:structure_end ].strip ()
681+
682+ if structure and len (structure ) > 3 :
683+ extracted_data ["건물내역" ] = structure
684+ print (f"1동 건물표시에서 구조 추출: '{ structure } '" )
685+ break
686+
646687
647688def _find_jeonyu_info (table_data , extracted_data ):
648689 """전유부분의 건물번호와 건물내역 정보 추출"""
@@ -711,42 +752,23 @@ def _analyze_row_for_building_info(row):
711752 building_info ["building_number" ] = cell_text
712753 break
713754
714- # 건물내역 패턴 검사
755+ # 건물내역 패턴 검사 - 구조 정보만 추출
715756 building_detail_patterns = [
716- r'.*구\s*조.*\d+\.?\d*\s*m2' ,
717- r'.*구\s*조.*\d+\.?\d*\s*㎡' ,
718- r'.*구조.*\d+\.?\d*\s*m2' ,
719- r'.*구조.*\d+\.?\d*\s*㎡' ,
757+ r'.*구\s*조.*' , # 면적 부분 제거, 구조만
758+ r'.*구조.*' , # 면적 부분 제거, 구조만
720759 ]
721760 for pattern in building_detail_patterns :
722761 if re .search (pattern , cell_text ) and not building_info ["building_detail" ]:
723- building_info ["building_detail" ] = cell_text
724- break
725-
726- # 면적 정보가 있는 경우 구조 정보와 결합
727- area_pattern = r'\d+\.?\d*\s*(m2|㎡)'
728- if re .search (area_pattern , cell_text ) and not building_info ["building_detail" ]:
729- structure_info = _find_structure_in_same_row (row , cell_idx )
730- if structure_info :
731- building_info ["building_detail" ] = f"{ structure_info } { cell_text } "
762+ # 면적 부분 제거하고 구조만 남기기
763+ structure_only = re .sub (r'\s*\d+\.?\d*\s*(?:m2|㎡)' , '' , cell_text ).strip ()
764+ # "구 조"를 "구조"로 변경
765+ structure_only = structure_only .replace ('구 조' , '구조' )
766+ if structure_only : # 빈 문자열이 아닌 경우만
767+ building_info ["building_detail" ] = structure_only
768+ break
732769
733770 return building_info
734771
735-
736- def _find_structure_in_same_row (row , exclude_cell_idx ):
737- """같은 행에서 구조 정보 찾기"""
738- structure_keywords = ["콘크리트" , "구조" , "철골" , "철근" , "목구조" , "벽돌" , "블록" ]
739- for cell_idx , cell in enumerate (row ):
740- if cell_idx == exclude_cell_idx or not cell :
741- continue
742-
743- cell_text = str (cell ).strip ()
744- if any (keyword in cell_text for keyword in structure_keywords ):
745- if not re .search (r'\d+\.?\d*\s*(m2|㎡)' , cell_text ):
746- return cell_text
747- return None
748-
749-
750772def _extract_latest_owner_info (gabgu_data ) -> Dict [str , Any ]:
751773 """갑구 데이터에서 최신 소유자 정보 추출"""
752774 latest_owner_info = {
@@ -891,11 +913,19 @@ def _prepare_risk_analysis_data(result, owner_info, mortgage_info) -> Dict[str,
891913 region_address = parts [0 ].strip () # 도로명주소 앞부분만
892914 road_address = parts [1 ].strip () # 도로명주소 부분만
893915
916+ # 건물내역에서 띄어쓰기 수정
917+ building_detail = result ["표제부" ].get ("건물내역" ) or ""
918+ if building_detail :
919+ building_detail = building_detail .replace ('구 조' , '구조' )
920+
894921 risk_data = {
895922 "region_address" : region_address ,
896923 "road_address" : road_address ,
924+ "building_number" : result ["표제부" ].get ("건물번호" ) or "" , # 건물번호 추가
925+ "building_detail" : building_detail , # 건물내역 추가
897926 "owner_name" : owner_info .get ("소유자명" ) or "" ,
898927 "owner_birth_date" : _parse_birth_date_from_id (owner_info .get ("주민번호" )),
928+ "issue_date" : result .get ("발행일" ) or "" , # 발행일 추가
899929 "has_seizure" : result ["법적상태" ]["가압류_여부" ],
900930 "has_auction" : result ["법적상태" ]["경매_여부" ],
901931 "has_litigation" : result ["법적상태" ]["소송_여부" ],
@@ -972,14 +1002,6 @@ def _extract_amount_from_korean(amount_str: str) -> Optional[int]:
9721002 return None
9731003
9741004
def extract_title_info(lines):
    """Legacy helper: no longer used, kept only for backward compatibility.

    Scans ``lines`` for entries containing "표제부" and records the match
    under the key "표제부 라벨". When several lines match, the last one wins.

    Args:
        lines: Iterable of text lines. ``None`` is treated as empty, and
            falsy entries (``None``, ``""``) are skipped.

    Returns:
        A dict that is empty when nothing matched, otherwise
        ``{"표제부 라벨": <last matching line>}``.
    """
    title_info = {}
    # `lines or ()` and the truthiness check stop a missing input or a None
    # entry from raising TypeError on the `in` test.
    for line in lines or ():
        if line and "표제부" in line:
            title_info["표제부 라벨"] = line
    return title_info
982-
9831005
9841006def save_json (output_dict , output_path ):
9851007 """JSON 파일로 저장"""
0 commit comments