diff --git a/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java b/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java index 0391086..fc2dc1b 100644 --- a/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java +++ b/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java @@ -2,11 +2,10 @@ import lombok.RequiredArgsConstructor; import org.sopt.kareer.domain.jobposting.exception.JobPostingException; -import org.sopt.kareer.global.external.clova.service.DocumentProcessingService; +import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; -import java.io.File; import java.util.List; import static org.sopt.kareer.domain.jobposting.exception.JobPostingErrorCode.RESUME_CONTEXT_FAILED; @@ -32,13 +31,8 @@ public String buildContext(List files) { sb.append("[RESUME_COVER_LETTER]\n"); for (MultipartFile file : files) { - File temp = null; - try { - temp = File.createTempFile("resume_", ".pdf"); - file.transferTo(temp); - - String text = documentProcessingService.extractTextWithOcr(temp); + String text = documentProcessingService.extractText(file); sb.append("----- FILE START -----\n"); sb.append(text).append("\n"); @@ -46,10 +40,6 @@ public String buildContext(List files) { } catch (Exception e) { throw new JobPostingException(RESUME_CONTEXT_FAILED, e.getMessage()); - } finally { - if (temp != null && temp.exists()) { - temp.delete(); - } } } diff --git a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java index c6eb2b9..3b70ea5 100644 --- a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java +++ b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java @@ -1,8 +1,6 @@ package org.sopt.kareer.domain.member.controller; -import static org.sopt.kareer.global.config.swagger.SwaggerResponseDescription.*; - import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import jakarta.servlet.http.HttpServletRequest; @@ -12,7 +10,9 @@ import org.sopt.kareer.domain.member.dto.request.MemberOnboardRequest; import org.sopt.kareer.domain.member.dto.request.MypageRequest; import org.sopt.kareer.domain.member.dto.response.*; -import org.sopt.kareer.domain.member.entity.constants.*; +import org.sopt.kareer.domain.member.entity.constants.Field; +import org.sopt.kareer.domain.member.entity.constants.Major; +import org.sopt.kareer.domain.member.entity.constants.University; import org.sopt.kareer.domain.member.entity.enums.Country; import org.sopt.kareer.domain.member.service.MemberService; import org.sopt.kareer.domain.roadmap.dto.response.RoadmapTestResponse; @@ -23,9 +23,13 @@ import org.sopt.kareer.global.config.swagger.SwaggerResponseDescription; import org.sopt.kareer.global.response.BaseResponse; import org.springframework.http.HttpStatus; +import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.security.core.annotation.AuthenticationPrincipal; import org.springframework.web.bind.annotation.*; +import org.springframework.web.multipart.MultipartFile; + +import static org.sopt.kareer.global.config.swagger.SwaggerResponseDescription.*; @RestController @RequiredArgsConstructor @@ -155,4 +159,21 @@ public ResponseEntity> deleteMember(@AuthenticationPrincipal .body(BaseResponse.ok("회원 탈퇴에 성공하였습니다.")); } + @Operation(summary = "온보딩 비자 OCR API", description = "온보딩 과정에서 유저의 비자 문서를 분석하여 정보를 추출합니다.") + @PostMapping(value = "/onboard/ocr/visa", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public ResponseEntity> getVisaInfo( + @RequestPart("file") MultipartFile file){ + return ResponseEntity.status(HttpStatus.OK) + .body(BaseResponse.ok(memberService.getVisaOcr(file), "사용자 비자 정보 추출에 성공했습니다.")); + } + + @Operation(summary = "온보딩 여권 OCR API", description = "온보딩 과정에서 유저의 여권을 분석하여 정보를 추출합니다.") + @PostMapping(value = "/onboard/ocr/passport", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public ResponseEntity> getPassportInfo( + @RequestPart("file") MultipartFile file){ + return ResponseEntity.status(HttpStatus.OK) + .body(BaseResponse.ok(memberService.getPassportOcr(file), "사용자 여권 정보 추출에 성공했습니다.")); + } + + } diff --git a/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java new file mode 100644 index 0000000..85063ff --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java @@ -0,0 +1,28 @@ +package org.sopt.kareer.domain.member.dto.response; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Builder; +import org.sopt.kareer.domain.member.entity.enums.Country; +import org.sopt.kareer.domain.member.util.PassportOcrParser; + +import java.time.LocalDate; + +@Builder +public record OcrPassportResponse( + @Schema(description = "Full Name", example = "Hong Seungwon") + String fullName, + + @Schema(description = "국가", example = "AFGHANISTAN") + Country country, + + @Schema(description = "생년월일") + LocalDate birthDate +) { + public static OcrPassportResponse from(PassportOcrParser.PassportInfo passportInfo) { + return OcrPassportResponse.builder() + .fullName(passportInfo.fullName()) + .country(passportInfo.country()) + .birthDate(passportInfo.birthDate()) + .build(); + } +} diff --git a/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java new file mode 100644 index 0000000..b237fdb --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java @@ -0,0 +1,29 @@ +package org.sopt.kareer.domain.member.dto.response; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Builder; +import org.sopt.kareer.domain.member.entity.enums.VisaType; +import org.sopt.kareer.domain.member.util.VisaOcrParser; + +import java.time.LocalDate; + +@Builder +public record OcrVisaResponse( + @Schema(description = "비자 유형") + VisaType visaType, + + @Schema(description = "비자 발급일") + LocalDate visaStartDate, + + @Schema(description = "비자 만료일") + LocalDate visaExpiredAt +){ + public static OcrVisaResponse from(VisaOcrParser.VisaInfo visaInfo) { + return OcrVisaResponse.builder() + .visaType(visaInfo.visaType()) + .visaStartDate(visaInfo.visaStartDate()) + .visaExpiredAt(visaInfo.visaExpiredAt()) + .build(); + } +} + diff --git a/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java b/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java index 61b4282..1efb8b8 100644 --- a/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java +++ b/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java @@ -12,4 +12,25 @@ public enum VisaType { ; private final String description; + + public static VisaType from(String originalText) { + if (originalText == null || originalText.isBlank()) { + return null; + } + + String normalized = normalize(originalText); + + for (VisaType visaType : values()) { + if (normalize(visaType.name()).equals(normalized) + || normalize(visaType.description).equals(normalized)) { + return visaType; + } + } + + return null; + } + + private static String normalize(String value) { + return value.replaceAll("[^A-Za-z0-9]", "").toUpperCase(); + } } diff --git a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java index 69d6405..10f27d6 100644 --- a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java +++ b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java @@ -1,19 +1,29 @@ package org.sopt.kareer.domain.member.service; import lombok.RequiredArgsConstructor; -import org.sopt.kareer.domain.member.dto.request.*; +import org.sopt.kareer.domain.member.dto.request.MemberOnboardRequest; +import org.sopt.kareer.domain.member.dto.request.MemberOnboardV2Request; import org.sopt.kareer.domain.member.dto.response.*; -import org.sopt.kareer.domain.member.entity.*; +import org.sopt.kareer.domain.member.entity.Member; +import org.sopt.kareer.domain.member.entity.MemberVisa; import org.sopt.kareer.domain.member.entity.enums.MemberStatus; -import org.sopt.kareer.domain.member.exception.*; -import org.sopt.kareer.domain.member.repository.*; +import org.sopt.kareer.domain.member.exception.MemberErrorCode; +import org.sopt.kareer.domain.member.exception.MemberException; +import org.sopt.kareer.domain.member.repository.MemberRepository; +import org.sopt.kareer.domain.member.repository.MemberVisaRepository; import org.sopt.kareer.domain.member.service.dto.request.MypageCommand; +import org.sopt.kareer.domain.member.util.PassportOcrParser; +import org.sopt.kareer.domain.member.util.VisaOcrParser; +import org.sopt.kareer.global.document.exception.DocumentErrorCode; +import org.sopt.kareer.global.document.exception.DocumentException; +import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.sopt.kareer.global.exception.customexception.GlobalException; import org.sopt.kareer.global.exception.errorcode.GlobalErrorCode; import org.sopt.kareer.global.oauth.dto.OAuthAttributes; import org.springframework.dao.DataIntegrityViolationException; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; +import org.springframework.web.multipart.MultipartFile; @Service @RequiredArgsConstructor @@ -23,6 +33,9 @@ public class MemberService { private final MemberRepository memberRepository; private final MemberVisaRepository memberVisaRepository; private final MemberDeletionService memberDeletionService; + private final DocumentProcessingService documentProcessingService; + private final VisaOcrParser visaOcrParser; + private final PassportOcrParser passportOcrParser; public Member getById(Long memberId) { return memberRepository.findById(memberId) @@ -166,4 +179,35 @@ public void deleteMember(Long memberId) { Member member = getById(memberId); memberDeletionService.deleteMember(member); } + + + public OcrVisaResponse getVisaOcr(MultipartFile file){ + try { + String text = documentProcessingService.extractText(file); + VisaOcrParser.VisaInfo visaInfo = visaOcrParser.parse(text); + + return OcrVisaResponse.from(visaInfo); + } catch (DocumentException e) { + throw e; + } catch(Exception e) { + throw new DocumentException( + DocumentErrorCode.OCR_PROCESSING_FAILED + ); + } + } + + public OcrPassportResponse getPassportOcr(MultipartFile file) { + try { + String text = documentProcessingService.extractText(file); + PassportOcrParser.PassportInfo passportInfo = passportOcrParser.parse(text); + + return OcrPassportResponse.from(passportInfo); + } catch (DocumentException e) { + throw e; + } catch(Exception e) { + throw new DocumentException( + DocumentErrorCode.OCR_PROCESSING_FAILED + ); + } + } } diff --git a/src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java b/src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java new file mode 100644 index 0000000..125417e --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java @@ -0,0 +1,51 @@ +package org.sopt.kareer.domain.member.util; + +import org.sopt.kareer.domain.member.entity.enums.Country; +import org.springframework.stereotype.Component; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +@Component +public class CountryResolver { + + private final Map ISO3_MAP; + + public CountryResolver() { + ISO3_MAP = buildIso3Map(); + } + + private Map buildIso3Map() { + Map map = new HashMap<>(); + + for (Country country : Country.values()) { + try { + Locale locale = findLocaleByCountryName(country.getCountryName()); + + if (locale != null) { + String iso3 = locale.getISO3Country(); + map.put(iso3, country); + } + + } catch (Exception ignored) { + } + } + + return map; + } + + private Locale findLocaleByCountryName(String countryName) { + for (Locale locale : Locale.getAvailableLocales()) { + if (countryName.equalsIgnoreCase(locale.getDisplayCountry(Locale.ENGLISH))) { + return locale; + } + } + return null; + } + + public Country resolveIso3(String iso3) { + if (iso3 == null) return null; + return ISO3_MAP.get(iso3.toUpperCase()); + } +} diff --git a/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java new file mode 100644 index 0000000..b9b4204 --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java @@ -0,0 +1,201 @@ +package org.sopt.kareer.domain.member.util; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.sopt.kareer.domain.member.entity.enums.Country; +import org.sopt.kareer.global.document.util.DocumentDateUtils; +import org.sopt.kareer.global.document.util.DocumentTextUtils; +import org.springframework.stereotype.Component; + +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Slf4j +@Component +@RequiredArgsConstructor +public class PassportOcrParser { + + private static final int TD3_LENGTH = 44; + + private final CountryResolver countryResolver; + + public PassportInfo parse(String rawText) { + String text = DocumentTextUtils.normalize(rawText); + Mrz mrz = extractMrz(text); + + if (mrz == null) { + return new PassportInfo(null, null, null); + } + + String fullName = extractName(mrz.line1()); + Country country = extractCountry(mrz.line1(), mrz.line2()); + LocalDate birthDate = extractBirth(mrz.line2()); + + return new PassportInfo(fullName, country, birthDate); + } + + private Mrz extractMrz(String text) { + List candidates = collectCandidates(text); + + String line1 = null; + String line2 = null; + + for (String c : candidates) { + + if (line1 == null && isLine1(c)) { + line1 = normalizeLineLength(c); + continue; + } + + if (line2 == null && isLine2(c)) { + line2 = normalizeLineLength(c); + } + + if (line1 != null && line2 != null) break; + } + + if (line1 == null && line2 == null) return null; + + return new Mrz(line1, line2); + } + + private List collectCandidates(String text) { + List result = new ArrayList<>(); + String upper = text.toUpperCase(Locale.ROOT); + + for (String token : upper.split("\\s+")) { + String cleaned = sanitize(token); + if (cleaned.length() >= 20) result.add(cleaned); + } + + Matcher m = Pattern.compile("[A-Z0-9<]{20,}").matcher(upper); + while (m.find()) { + result.add(sanitize(m.group())); + } + + return result; + } + + private String sanitize(String v) { + return v.replaceAll("[^A-Z0-9<]", ""); + } + + private boolean isLine1(String v) { + if (v == null || v.length() < 20) return false; + + String fitted = normalizeLineLength(v); + + if (fitted.charAt(0) != 'P') return false; + + // 3~5: 국가코드 + if (!substringSafely(fitted, 2, 5).matches("[A-Z]{3}")) return false; + + return fitted.contains("<<"); + } + + private boolean isLine2(String v) { + if (v == null || v.length() < 20) return false; + + String fitted = normalizeLineLength(v); + + // 11~13: 국가코드 + if (!substringSafely(fitted, 10, 13).matches("[A-Z]{3}")) return false; + + // 14~19: 생년월일 + return substringSafely(fitted, 13, 19).matches("\\d{6}"); + } + + private String normalizeLineLength(String v) { + if (v.length() >= TD3_LENGTH) { + return v.substring(0, TD3_LENGTH); + } + return v + "<".repeat(TD3_LENGTH - v.length()); + } + + private String substringSafely(String v, int s, int e) { + if (v.length() < e) return ""; + return v.substring(s, e); + } + + private String extractName(String line1) { + if (line1 == null) return null; + + try { + String fitted = normalizeLineLength(line1); + + // 반드시 P로 시작해야 함 + if (fitted.charAt(0) != 'P') return null; + + // 국가코드 이후부터 이름 + String body = fitted.substring(5); + + String[] parts = body.split("<<", 2); + if (parts.length < 2) return null; + + String surname = normalizeName(parts[0]); + String given = normalizeName(parts[1]); + + String name = (surname + " " + given).trim(); + return name.isBlank() ? null : name; + + } catch (Exception e) { + return null; + } + } + + private Country extractCountry(String line1, String line2) { + try { + if (line1 != null) { + String fitted = normalizeLineLength(line1); + String code = substringSafely(fitted, 2, 5); + Country c = countryResolver.resolveIso3(code); + if (c != null) return c; + } + + if (line2 != null) { + String fitted = normalizeLineLength(line2); + String code = substringSafely(fitted, 10, 13); + return countryResolver.resolveIso3(code); + } + + return null; + + } catch (Exception e) { + return null; + } + } + + private LocalDate extractBirth(String line2) { + try { + if (line2 == null) return null; + + String fitted = normalizeLineLength(line2); + String birth = substringSafely(fitted, 13, 19); + + if (!birth.matches("\\d{6}")) return null; + + return DocumentDateUtils.parseLocalDate(birth); + + } catch (Exception e) { + return null; + } + } + + private String normalizeName(String v) { + return v.replace("<", " ") + .replaceAll("\\s+", " ") + .trim(); + } + + private record Mrz(String line1, String line2) {} + + public record PassportInfo( + String fullName, + Country country, + LocalDate birthDate + ) {} +} \ No newline at end of file diff --git a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java new file mode 100644 index 0000000..ac1a9d9 --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java @@ -0,0 +1,156 @@ +package org.sopt.kareer.domain.member.util; + +import lombok.extern.slf4j.Slf4j; +import org.sopt.kareer.domain.member.entity.enums.VisaType; +import org.sopt.kareer.global.document.util.DocumentDateUtils; +import org.sopt.kareer.global.document.util.DocumentTextUtils; +import org.springframework.stereotype.Component; + +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Slf4j +@Component +public class VisaOcrParser { + + private static final String DATE_REGEX = + "(\\d{4}[./-]\\d{1,2}[./-]\\d{1,2}|\\d{1,2}[./-]\\d{1,2}[./-]\\d{4}|\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4})"; + + private static final Pattern DATE_PATTERN = Pattern.compile( + "\\b" + DATE_REGEX + "\\b", + Pattern.CASE_INSENSITIVE + ); + + private static final Pattern SUPPORTED_VISA_PATTERN = Pattern.compile( + "(?i)\\b(D\\s*-?\\s*2|D\\s*-?\\s*10|E\\s*-?\\s*7)\\b" + ); + + private static final Pattern START_DATE_PATTERN = Pattern.compile( + "(?i)(issue\\s*date|date\\s*of\\s*issue|grant\\s*date|발급일)\\s*[/|:]?\\s*" + DATE_REGEX + ); + + private static final Pattern EXPIRE_DATE_PATTERN = Pattern.compile( + "(?i)(final\\s*entry\\s*date|expiry\\s*date|expiration\\s*date|valid\\s*until|만료일|입국만료일)\\s*[/|:]?\\s*" + DATE_REGEX + ); + + public VisaInfo parse(String rawText) { + String text = DocumentTextUtils.normalize(rawText); + + VisaType visaType = extractVisaType(text); + LocalDate visaStartDate = extractVisaStartDate(text); + LocalDate visaExpiredAt = extractVisaExpiredAt(text); + + List allDates = extractAllDates(text); + + if (visaExpiredAt == null) { + visaExpiredAt = inferExpireDate(allDates, visaStartDate); + } + + if (visaExpiredAt == null) { + visaExpiredAt = extractExpireDateFromMrz(text); + } + + if (visaStartDate == null) { + visaStartDate = inferStartDate(allDates, visaExpiredAt); + } + + return new VisaInfo(visaType, visaStartDate, visaExpiredAt); + } + + private VisaType extractVisaType(String text) { + Matcher matcher = SUPPORTED_VISA_PATTERN.matcher(text); + if (matcher.find()) { + return VisaType.from(matcher.group(1)); + } + return null; + } + + private LocalDate extractVisaStartDate(String text) { + Matcher matcher = START_DATE_PATTERN.matcher(text); + if (matcher.find()) { + return DocumentDateUtils.parseDate(matcher.group(2)); + } + return null; + } + + private LocalDate extractVisaExpiredAt(String text) { + Matcher matcher = EXPIRE_DATE_PATTERN.matcher(text); + if (matcher.find()) { + return DocumentDateUtils.parseDate(matcher.group(2)); + } + return null; + } + + private List extractAllDates(String text) { + List dates = new ArrayList<>(); + Matcher matcher = DATE_PATTERN.matcher(text); + + while (matcher.find()) { + LocalDate parsed = DocumentDateUtils.parseDate(matcher.group(1)); + if (parsed != null) { + dates.add(parsed); + } + } + + return dates; + } + + private LocalDate inferStartDate(List dates, LocalDate expiredAt) { + if (dates.isEmpty()) { + return null; + } + + if (expiredAt != null) { + return dates.stream() + .filter(date -> date.isBefore(expiredAt)) + .max(LocalDate::compareTo) + .orElse(null); + } + + return dates.stream() + .min(LocalDate::compareTo) + .orElse(null); + } + + private LocalDate inferExpireDate(List dates, LocalDate startDate) { + if (dates.isEmpty()) { + return null; + } + + if (startDate != null) { + return dates.stream() + .filter(date -> date.isAfter(startDate)) + .max(LocalDate::compareTo) + .orElse(null); + } + + return dates.stream() + .max(Comparator.naturalOrder()) + .orElse(null); + } + + private LocalDate extractExpireDateFromMrz(String text) { + Pattern mrzPattern = Pattern.compile("[MF<](\\d{6})"); + Matcher matcher = mrzPattern.matcher(text); + + while (matcher.find()) { + LocalDate parsed = DocumentDateUtils.parseLocalDate(matcher.group(1)); + if (parsed != null) { + return parsed; + } + } + + return null; + } + + public record VisaInfo( + VisaType visaType, + LocalDate visaStartDate, + LocalDate visaExpiredAt + ) { + } +} \ No newline at end of file diff --git a/src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java b/src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java new file mode 100644 index 0000000..eb35b66 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java @@ -0,0 +1,3 @@ +package org.sopt.kareer.global.document.dto.response; + +public record PageText(int pageNumber, String text) {} diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java new file mode 100644 index 0000000..f1538a8 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java @@ -0,0 +1,29 @@ +package org.sopt.kareer.global.document.exception; + +import lombok.RequiredArgsConstructor; +import org.sopt.kareer.global.exception.errorcode.ErrorCode; +import org.springframework.http.HttpStatus; + +@RequiredArgsConstructor +public enum DocumentErrorCode implements ErrorCode { + EXTRACT_TEXT_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "PDF 파일에서 텍스트 추출에 실패하였습니다."), + INVALID_IMAGE_FILE(HttpStatus.BAD_REQUEST.value(), "유효하지 않은 이미지 파일입니다."), + FILE_EMPTY(HttpStatus.BAD_REQUEST.value(), "파일이 비어있습니다."), + UNSUPPORTED_FILE_TYPE(HttpStatus.BAD_REQUEST.value(), "지원하지 않는 파일 형식입니다."), + EXTRACT_IMAGE_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "이미지 추출에 실패하였습니다."), + OCR_PROCESSING_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "OCR 처리 중 오류가 발생했습니다."), + ; + + private final int httpStatus; + private final String message; + + @Override + public int getHttpStatus() { + return httpStatus; + } + + @Override + public String getMessage() { + return message; + } +} diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java new file mode 100644 index 0000000..7333ab4 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java @@ -0,0 +1,13 @@ +package org.sopt.kareer.global.document.exception; + +import org.sopt.kareer.global.exception.customexception.CustomException; + +public class DocumentException extends CustomException { + public DocumentException(DocumentErrorCode errorCode) { + super(errorCode); + } + + public DocumentException(DocumentErrorCode errorCode, String message) { + super(errorCode, message); + } +} diff --git a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java new file mode 100644 index 0000000..d4c2281 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java @@ -0,0 +1,140 @@ +package org.sopt.kareer.global.document.service; + +import lombok.RequiredArgsConstructor; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; +import org.sopt.kareer.global.document.dto.response.PageText; +import org.sopt.kareer.global.document.exception.DocumentErrorCode; +import org.sopt.kareer.global.document.exception.DocumentException; +import org.sopt.kareer.global.external.clova.service.ClovaOcrService; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.stream.Collectors; + +@Service +@RequiredArgsConstructor +public class DocumentProcessingService { + + private static final int OCR_DPI = 300; + private static final int MIN_TEXT_LENGTH = 20; + + private final ClovaOcrService clovaOcrService; + + public String extractText(MultipartFile file) throws IOException { + return extractPagesWithOcr(file).stream() + .map(PageText::text) + .collect(Collectors.joining("\n")); + } + + public List extractPagesWithOcr(MultipartFile file) throws IOException { + validate(file); + + String contentType = file.getContentType(); + String filename = file.getOriginalFilename(); + + if (isPdf(contentType, filename)) { + return extractPagesFromPdf(file); + } + + if (isImage(contentType, filename)) { + return extractPagesFromImage(file); + } + + throw new DocumentException(DocumentErrorCode.UNSUPPORTED_FILE_TYPE); + } + + private List extractPagesFromPdf(MultipartFile file) throws IOException { + try (PDDocument document = PDDocument.load(file.getInputStream())) { + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setSortByPosition(true); + + PDFRenderer renderer = new PDFRenderer(document); + List pages = new ArrayList<>(); + + for (int i = 1; i <= document.getNumberOfPages(); i++) { + stripper.setStartPage(i); + stripper.setEndPage(i); + + String text = sanitizeText(stripper.getText(document)); + + if (text.isBlank() || text.length() < MIN_TEXT_LENGTH) { + BufferedImage image = renderer.renderImageWithDPI(i - 1, OCR_DPI); + text = sanitizeText(clovaOcrService.doOcr(image)); + } + + if (!text.isBlank()) { + pages.add(new PageText(i, text)); + } + } + + return pages; + } + } + + private List extractPagesFromImage(MultipartFile file) { + try { + BufferedImage image = ImageIO.read(file.getInputStream()); + + if (image == null) { + throw new DocumentException(DocumentErrorCode.INVALID_IMAGE_FILE, "유효하지 않은 이미지 파일입니다."); + } + + String text = sanitizeText(clovaOcrService.doOcr(image)); + + if (text.isBlank()) { + return List.of(); + } + + return List.of(new PageText(1, text)); + } catch (DocumentException e) { + throw e; + } catch (IOException e) { + throw new DocumentException(DocumentErrorCode.INVALID_IMAGE_FILE, e.getMessage()); + } catch (Exception e) { + throw new DocumentException(DocumentErrorCode.EXTRACT_IMAGE_FAILED, e.getMessage()); + } + } + + private void validate(MultipartFile file) { + if (file == null || file.isEmpty()) { + throw new DocumentException(DocumentErrorCode.FILE_EMPTY, "파일이 비어 있습니다."); + } + } + + private boolean isPdf(String contentType, String filename) { + return "application/pdf".equalsIgnoreCase(contentType) + || (filename != null && filename.toLowerCase().endsWith(".pdf")); + } + + private boolean isImage(String contentType, String filename) { + if (contentType != null && contentType.startsWith("image/")) { + return true; + } + + if (filename == null) { + return false; + } + + String lower = filename.toLowerCase(Locale.ROOT); + return lower.endsWith(".jpg") + || lower.endsWith(".jpeg") + || lower.endsWith(".png"); + } + + private static String sanitizeText(String s) { + return s == null ? "" : + s.replace("\u0000", "") + .replaceAll("[\\x01-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]", " ") + .replace('\uFFFD', ' ') + .replaceAll("\\s+", " ") + .trim(); + } +} \ No newline at end of file diff --git a/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java new file mode 100644 index 0000000..945de24 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java @@ -0,0 +1,68 @@ +package org.sopt.kareer.global.document.util; + +import lombok.AccessLevel; +import lombok.NoArgsConstructor; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.util.List; +import java.util.Locale; + +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DocumentDateUtils { + private static final List DEFAULT_DATE_FORMATTERS = List.of( + DateTimeFormatter.ofPattern("yyyy-MM-dd"), + DateTimeFormatter.ofPattern("yyyy.MM.dd"), + DateTimeFormatter.ofPattern("yyyy/MM/dd"), + DateTimeFormatter.ofPattern("dd-MM-yyyy"), + DateTimeFormatter.ofPattern("dd.MM.yyyy"), + DateTimeFormatter.ofPattern("dd/MM/yyyy"), + new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("d MMM yyyy") + .toFormatter(Locale.ENGLISH), + new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("dd MMM yyyy") + .toFormatter(Locale.ENGLISH) + ); + + public static LocalDate parseDate(String raw) { + if (raw == null || raw.isBlank()) { + return null; + } + + String value = raw.trim() + .toUpperCase(Locale.ROOT) + .replaceAll("\\s+", " "); + + for (DateTimeFormatter formatter : DEFAULT_DATE_FORMATTERS) { + try { + return LocalDate.parse(value, formatter); + } catch (Exception ignored) { + } + } + + return null; + } + + public static LocalDate parseLocalDate(String value) { + try { + int yy = Integer.parseInt(value.substring(0, 2)); + int month = Integer.parseInt(value.substring(2, 4)); + int day = Integer.parseInt(value.substring(4, 6)); + + int year = 2000 + yy; + LocalDate date = LocalDate.of(year, month, day); + + if (date.isAfter(LocalDate.now())) { + date = date.minusYears(100); + } + + return date; + } catch (Exception e) { + return null; + } + } +} diff --git a/src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java b/src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java new file mode 100644 index 0000000..77c940e --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java @@ -0,0 +1,12 @@ +package org.sopt.kareer.global.document.util; + +import lombok.AccessLevel; +import lombok.NoArgsConstructor; + +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DocumentTextUtils { + + public static String normalize(String text){ + return text == null ? "" : text.replaceAll("\\s+", " ").trim(); + } +} diff --git a/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java b/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java index 2c4810e..1346a36 100644 --- a/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java +++ b/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java @@ -6,6 +6,7 @@ import org.sopt.kareer.domain.jobposting.exception.JobPostingErrorCode; import org.sopt.kareer.domain.jobposting.exception.JobPostingException; import org.sopt.kareer.domain.jobposting.repository.JobPostingRepository; +import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.sopt.kareer.global.external.ai.builder.JobPostingEmbeddingTextBuilder; import org.sopt.kareer.global.external.ai.dto.response.RequiredSection; import org.sopt.kareer.global.external.ai.enums.RequiredCategory; @@ -13,7 +14,6 @@ import org.sopt.kareer.global.external.ai.exception.RagException; import org.sopt.kareer.global.external.ai.util.OcrTextNormalizer; import org.sopt.kareer.global.external.ai.util.RequiredPdfParser; -import org.sopt.kareer.global.external.clova.service.DocumentProcessingService; import org.springframework.ai.document.Document; import org.springframework.ai.transformer.splitter.TokenTextSplitter; import org.springframework.ai.vectorstore.pgvector.PgVectorStore; @@ -21,7 +21,6 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.web.multipart.MultipartFile; -import java.io.File; import java.util.*; import static org.sopt.kareer.global.external.ai.constant.RequiredDocumentConstant.*; @@ -75,18 +74,13 @@ public void embedJobPosting(List jobPostingIds) { } private void uploadDocument(List files, PgVectorStore targetStore) { - File temp = null; - for (MultipartFile file : files) { try { - temp = File.createTempFile("upload_", ".pdf"); - file.transferTo(temp); - Map baseMeta = new HashMap<>(); baseMeta.put("originalFilename", Objects.toString(file.getOriginalFilename(), "")); baseMeta.put("uploadedAt", System.currentTimeMillis()); - var pages = documentProcessingService.extractPagesWithOcr(temp); + var pages = documentProcessingService.extractPagesWithOcr(file); List toStore = new ArrayList<>(); for (var page : pages) { @@ -100,8 +94,6 @@ private void uploadDocument(List files, PgVectorStore targetStore } catch (Exception e) { throw new RagException(RagErrorCode.EMBEDDING_FAILED, e.getMessage()); - } finally { - if (temp != null && temp.exists()) temp.delete(); } } } @@ -112,14 +104,8 @@ public void uploadRequiredDocument(MultipartFile file, RequiredCategory required } private void uploadAndIngest(MultipartFile file, String source, RequiredCategory category) { - if (file == null || file.isEmpty()) return; - - File temp = null; try { - temp = File.createTempFile("upload_", ".pdf"); - file.transferTo(temp); - - var pages = documentProcessingService.extractPagesWithOcr(temp); + var pages = documentProcessingService.extractPagesWithOcr(file); StringBuilder full = new StringBuilder(); for (var p : pages) { full.append(p.text()).append("\n"); @@ -149,8 +135,6 @@ private void uploadAndIngest(MultipartFile file, String source, RequiredCategory } catch (Exception e) { throw new RagException(RagErrorCode.EMBEDDING_FAILED, e.getMessage()); - } finally { - if (temp != null && temp.exists()) temp.delete(); } } diff --git a/src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java b/src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java deleted file mode 100644 index 4e012db..0000000 --- a/src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java +++ /dev/null @@ -1,3 +0,0 @@ -package org.sopt.kareer.global.external.clova.dto.response; - -public record PageText(int pageNumber, String text) {} diff --git a/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java new file mode 100644 index 0000000..0b8ce66 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java @@ -0,0 +1,24 @@ +package org.sopt.kareer.global.external.clova.exception; + +import lombok.RequiredArgsConstructor; +import org.sopt.kareer.global.exception.errorcode.ErrorCode; +import org.springframework.http.HttpStatus; + +@RequiredArgsConstructor +public enum ClovaErrorCode implements ErrorCode { + EXTRACT_IMAGE_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "이미지로부터 텍스트를 추출하는데 실패했습니다.") + ; + + private final int httpStatus; + private final String message; + + @Override + public int getHttpStatus() { + return httpStatus; + } + + @Override + public String getMessage() { + return message; + } +} diff --git a/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java new file mode 100644 index 0000000..1b89e60 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java @@ -0,0 +1,13 @@ +package org.sopt.kareer.global.external.clova.exception; + +import org.sopt.kareer.global.exception.customexception.CustomException; + +public class ClovaException extends CustomException { + public ClovaException(ClovaErrorCode errorCode) { + super(errorCode); + } + + public ClovaException(ClovaErrorCode errorCode, String message) { + super(errorCode, message); + } +} diff --git a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java index 9c00775..5c6ed84 100644 --- a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java +++ b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java @@ -1,16 +1,20 @@ package org.sopt.kareer.global.external.clova.service; import lombok.RequiredArgsConstructor; -import org.sopt.kareer.global.external.ai.exception.RagErrorCode; -import org.sopt.kareer.global.external.ai.exception.RagException; +import lombok.extern.slf4j.Slf4j; import org.sopt.kareer.global.external.clova.dto.request.ClovaOcrRequest; import org.sopt.kareer.global.external.clova.dto.response.ClovaOcrResponse; +import org.sopt.kareer.global.external.clova.exception.ClovaErrorCode; +import org.sopt.kareer.global.external.clova.exception.ClovaException; import org.springframework.beans.factory.annotation.Value; +import org.springframework.http.HttpStatusCode; import org.springframework.http.MediaType; import org.springframework.stereotype.Service; import org.springframework.web.reactive.function.client.WebClient; +import reactor.core.publisher.Mono; import javax.imageio.ImageIO; +import java.awt.*; import java.awt.image.BufferedImage; import java.io.ByteArrayOutputStream; import java.time.Duration; @@ -19,10 +23,13 @@ import java.util.UUID; import java.util.stream.Collectors; +@Slf4j @Service @RequiredArgsConstructor public class ClovaOcrService { + private static final String OCR_IMAGE_FORMAT = "png"; + private final WebClient clovaOcrWebClient; @Value("${spring.clova.ocr.timeout-ms:15000}") @@ -30,48 +37,120 @@ public class ClovaOcrService { public String doOcr(BufferedImage image) { try { - String base64 = Base64.getEncoder().encodeToString(toJpgBytes(image)); + byte[] imageBytes = toOcrBytes(image); + String base64 = Base64.getEncoder().encodeToString(imageBytes); ClovaOcrRequest body = new ClovaOcrRequest( "V2", UUID.randomUUID().toString(), System.currentTimeMillis(), - List.of(new ClovaOcrRequest.Image("jpg", "page", base64)) + List.of(new ClovaOcrRequest.Image(OCR_IMAGE_FORMAT, "page", base64)) ); ClovaOcrResponse response = clovaOcrWebClient.post() .uri("") .contentType(MediaType.APPLICATION_JSON) .bodyValue(body) - .retrieve() - .bodyToMono(ClovaOcrResponse.class) + .exchangeToMono(clientResponse -> { + HttpStatusCode status = clientResponse.statusCode(); + + if (status.is2xxSuccessful()) { + return clientResponse.bodyToMono(ClovaOcrResponse.class); + } + + return clientResponse.bodyToMono(String.class) + .defaultIfEmpty("") + .flatMap(errorBody -> { + log.error("CLOVA OCR error response. status={}, body={}", + status.value(), errorBody); + + return Mono.error( + new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "CLOVA OCR error. status=" + status.value() + ", body=" + errorBody + ) + ); + }); + }) .block(Duration.ofMillis(timeoutMs)); - if (response == null || response.images() == null || response.images().isEmpty()) return ""; + if (response == null || response.images() == null || response.images().isEmpty()) { + log.warn("CLOVA OCR response is empty"); + return ""; + } var fields = response.images().get(0).fields(); - if (fields == null || fields.isEmpty()) return ""; + if (fields == null || fields.isEmpty()) { + log.warn("CLOVA OCR fields are empty"); + return ""; + } return fields.stream() .map(ClovaOcrResponse.Field::inferText) - .filter(s -> s != null && !s.isBlank()) + .filter(text -> text != null && !text.isBlank()) .map(String::trim) .collect(Collectors.joining(" ")); + } catch (ClovaException e) { + throw e; } catch (Exception e) { - throw new RagException( - RagErrorCode.EXTRACT_IMAGE_FAILED, + log.error("CLOVA OCR failed", e); + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, "CLOVA OCR failed: " + e.getMessage() ); } } - private byte[] toJpgBytes(BufferedImage image) { + private byte[] toOcrBytes(BufferedImage image) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - ImageIO.write(image, "jpg", baos); + BufferedImage normalized = normalizeImage(image); + + boolean success = ImageIO.write(normalized, OCR_IMAGE_FORMAT, baos); + if (!success || baos.size() == 0) { + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "Image encoding failed" + ); + } + return baos.toByteArray(); + } catch (ClovaException e) { + throw e; } catch (Exception e) { - throw new RagException(RagErrorCode.EXTRACT_IMAGE_FAILED, "Image encoding failed: " + e.getMessage()); + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "Image encoding failed: " + e.getMessage() + ); + } + } + + /** + * 모든 입력 이미지를 OCR 전송용 표준 RGB 이미지로 정규화한다. + */ + private BufferedImage normalizeImage(BufferedImage source) { + if (source == null) { + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "Image is null" + ); + } + + BufferedImage target = new BufferedImage( + source.getWidth(), + source.getHeight(), + BufferedImage.TYPE_INT_RGB + ); + + Graphics2D g = target.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, target.getWidth(), target.getHeight()); + g.drawImage(source, 0, 0, null); + } finally { + g.dispose(); } + + return target; } } \ No newline at end of file diff --git a/src/main/java/org/sopt/kareer/global/external/clova/service/DocumentProcessingService.java b/src/main/java/org/sopt/kareer/global/external/clova/service/DocumentProcessingService.java deleted file mode 100644 index 1bcaa04..0000000 --- a/src/main/java/org/sopt/kareer/global/external/clova/service/DocumentProcessingService.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.sopt.kareer.global.external.clova.service; - -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.text.PDFTextStripper; -import org.sopt.kareer.global.external.ai.exception.RagErrorCode; -import org.sopt.kareer.global.external.ai.exception.RagException; -import org.sopt.kareer.global.external.clova.dto.response.PageText; -import org.springframework.stereotype.Service; - -import java.awt.image.BufferedImage; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -@Slf4j -@Service -@RequiredArgsConstructor -public class DocumentProcessingService { - - private static final double MIN_TEXT_PAGE_RATIO = 0.2; - private static final int OCR_DPI = 300; - - private final ClovaOcrService clovaOcrService; - - public List extractPagesWithOcr(File pdfFile) { - - List textPages = extractPageFromPdf(pdfFile); - int totalPages = getTotalPages(pdfFile); - - Map pageTextMap = textPages.stream() - .collect(Collectors.toMap(PageText::pageNumber, PageText::text)); - - try (PDDocument document = PDDocument.load(pdfFile)) { - PDFRenderer renderer = new PDFRenderer(document); - List pages = new ArrayList<>(totalPages); - - for (int i = 1; i <= totalPages; i++) { - String text = pageTextMap.getOrDefault(i, ""); - - if (textPages.size() < Math.max(1, (int) Math.ceil(totalPages * MIN_TEXT_PAGE_RATIO))) { - BufferedImage image = renderer.renderImageWithDPI(i - 1, OCR_DPI); - text = sanitizeText(clovaOcrService.doOcr(image)); - } - - if (!text.isBlank()) { - pages.add(new PageText(i, text)); - } - } - return pages; - - } catch (Exception e) { - throw new RagException(RagErrorCode.EXTRACT_IMAGE_FAILED, e.getMessage()); - } - } - - public String extractTextWithOcr(File pdfFile) { - return extractPagesWithOcr(pdfFile).stream() - .map(PageText::text) - .reduce("", (a, b) -> a + "\n" + b); - } - - private int getTotalPages(File pdfFile) { - try (PDDocument document = PDDocument.load(pdfFile)) { - return document.getNumberOfPages(); - } catch (IOException e) { - throw new RagException(RagErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); - } - } - - public List extractPageFromPdf(File pdfFile) { - try (PDDocument document = PDDocument.load(pdfFile)) { - PDFTextStripper stripper = new PDFTextStripper(); - stripper.setSortByPosition(true); - - int totalPages = document.getNumberOfPages(); - List pages = new ArrayList<>(totalPages); - - for (int i = 1; i <= totalPages; i++) { - stripper.setStartPage(i); - stripper.setEndPage(i); - - String cleanedText = sanitizeText(stripper.getText(document)); - if (!cleanedText.isBlank()) { - pages.add(new PageText(i, cleanedText)); - } - } - return pages; - } catch (IOException e) { - throw new RagException(RagErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); - } - } - - private static String sanitizeText(String s) { - return s == null ? "" : - s.replace("\u0000", "") - .replaceAll("[\\x01-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]", " ") - .replace('\uFFFD', ' '); - } - -}