From 9d67bcb9290531b1f2d12380c582db123120a0fe Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 13:45:00 +0900 Subject: [PATCH 01/20] =?UTF-8?q?[Feat]=20#204=20OCR=20=EC=A0=84=EC=9A=A9?= =?UTF-8?q?=20ErrorCode,=20=EC=98=88=EC=99=B8=20=ED=81=B4=EB=9E=98?= =?UTF-8?q?=EC=8A=A4=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../clova/exception/ClovaErrorCode.java | 24 +++++++++++++++++++ .../clova/exception/ClovaException.java | 13 ++++++++++ 2 files changed, 37 insertions(+) create mode 100644 src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java create mode 100644 src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java diff --git a/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java new file mode 100644 index 0000000..0b8ce66 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaErrorCode.java @@ -0,0 +1,24 @@ +package org.sopt.kareer.global.external.clova.exception; + +import lombok.RequiredArgsConstructor; +import org.sopt.kareer.global.exception.errorcode.ErrorCode; +import org.springframework.http.HttpStatus; + +@RequiredArgsConstructor +public enum ClovaErrorCode implements ErrorCode { + EXTRACT_IMAGE_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "이미지로부터 텍스트를 추출하는데 실패했습니다.") + ; + + private final int httpStatus; + private final String message; + + @Override + public int getHttpStatus() { + return httpStatus; + } + + @Override + public String getMessage() { + return message; + } +} diff --git a/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java new file mode 100644 index 0000000..1b89e60 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/external/clova/exception/ClovaException.java @@ -0,0 +1,13 @@ +package org.sopt.kareer.global.external.clova.exception; + +import org.sopt.kareer.global.exception.customexception.CustomException; + +public class ClovaException extends CustomException { + public ClovaException(ClovaErrorCode errorCode) { + super(errorCode); + } + + public ClovaException(ClovaErrorCode errorCode, String message) { + super(errorCode, message); + } +} From 8ef6754fb87590029e921677f603476c8e202a15 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 13:45:34 +0900 Subject: [PATCH 02/20] =?UTF-8?q?[Refactor]=20#204=20OCR=20=EA=B3=BC?= =?UTF-8?q?=EC=A0=95=EC=97=90=EC=84=9C=20=EB=B0=9C=EC=83=9D=ED=95=9C=20?= =?UTF-8?q?=EC=98=88=EC=99=B8=EA=B0=80=20ClovaException=EC=9D=84=20?= =?UTF-8?q?=EB=8D=98=EC=A7=80=EB=8F=84=EB=A1=9D=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../global/external/clova/service/ClovaOcrService.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java index 9c00775..0d09567 100644 --- a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java +++ b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java @@ -1,10 +1,10 @@ package org.sopt.kareer.global.external.clova.service; import lombok.RequiredArgsConstructor; -import org.sopt.kareer.global.external.ai.exception.RagErrorCode; -import org.sopt.kareer.global.external.ai.exception.RagException; import org.sopt.kareer.global.external.clova.dto.request.ClovaOcrRequest; import org.sopt.kareer.global.external.clova.dto.response.ClovaOcrResponse; +import org.sopt.kareer.global.external.clova.exception.ClovaErrorCode; +import org.sopt.kareer.global.external.clova.exception.ClovaException; import org.springframework.beans.factory.annotation.Value; import org.springframework.http.MediaType; import org.springframework.stereotype.Service; @@ -59,8 +59,8 @@ public String doOcr(BufferedImage image) { .collect(Collectors.joining(" ")); } catch (Exception e) { - throw new RagException( - RagErrorCode.EXTRACT_IMAGE_FAILED, + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, "CLOVA OCR failed: " + e.getMessage() ); } @@ -71,7 +71,7 @@ private byte[] toJpgBytes(BufferedImage image) { ImageIO.write(image, "jpg", baos); return baos.toByteArray(); } catch (Exception e) { - throw new RagException(RagErrorCode.EXTRACT_IMAGE_FAILED, "Image encoding failed: " + e.getMessage()); + throw new ClovaException(ClovaErrorCode.EXTRACT_IMAGE_FAILED, "Image encoding failed: " + e.getMessage()); } } } \ No newline at end of file From 62ff2d88a8993c948b2ffd386daf92fa1c8428f0 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 14:22:14 +0900 Subject: [PATCH 03/20] =?UTF-8?q?[Feat]=20#204=20=EB=AC=B8=EC=84=9C=20?= =?UTF-8?q?=EC=B2=98=EB=A6=AC=EC=9A=A9=20=EC=98=88=EC=99=B8=20=ED=81=B4?= =?UTF-8?q?=EB=9E=98=EC=8A=A4,=20errorCode=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../document/exception/DocumentErrorCode.java | 24 +++++++++++++++++++ .../document/exception/DocumentException.java | 13 ++++++++++ 2 files changed, 37 insertions(+) create mode 100644 src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java create mode 100644 src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java new file mode 100644 index 0000000..9cbafce --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java @@ -0,0 +1,24 @@ +package org.sopt.kareer.global.document.exception; + +import lombok.RequiredArgsConstructor; +import org.sopt.kareer.global.exception.errorcode.ErrorCode; +import org.springframework.http.HttpStatus; + +@RequiredArgsConstructor +public enum DocumentErrorCode implements ErrorCode { + EXTRACT_TEXT_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "PDF 파일에서 텍스트 추출에 실패하였습니다."), + ; + + private final int httpStatus; + private final String message; + + @Override + public int getHttpStatus() { + return httpStatus; + } + + @Override + public String getMessage() { + return message; + } +} diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java new file mode 100644 index 0000000..7333ab4 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentException.java @@ -0,0 +1,13 @@ +package org.sopt.kareer.global.document.exception; + +import org.sopt.kareer.global.exception.customexception.CustomException; + +public class DocumentException extends CustomException { + public DocumentException(DocumentErrorCode errorCode) { + super(errorCode); + } + + public DocumentException(DocumentErrorCode errorCode, String message) { + super(errorCode, message); + } +} From 89d1164b3b7209226e6fa21a55dc7ce217381d6e Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 14:22:43 +0900 Subject: [PATCH 04/20] =?UTF-8?q?[Refactor]=20#204=20=EB=AC=B8=EC=84=9C=20?= =?UTF-8?q?=EC=B2=98=EB=A6=AC=20=EB=8B=B4=EB=8B=B9=20=ED=81=B4=EB=9E=98?= =?UTF-8?q?=EC=8A=A4=20=ED=8C=A8=ED=82=A4=EC=A7=80=20=EC=9D=B4=EB=8F=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jobposting/util/ResumeContextService.java | 2 +- .../global/document/dto/response/PageText.java | 3 +++ .../service/DocumentProcessingService.java | 17 ++++++++++------- .../ai/service/RagEmbeddingService.java | 2 +- .../external/clova/dto/response/PageText.java | 3 --- 5 files changed, 15 insertions(+), 12 deletions(-) create mode 100644 src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java rename src/main/java/org/sopt/kareer/global/{external/clova => document}/service/DocumentProcessingService.java (81%) delete mode 100644 src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java diff --git a/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java b/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java index 0391086..cb874d3 100644 --- a/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java +++ b/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java @@ -2,7 +2,7 @@ import lombok.RequiredArgsConstructor; import org.sopt.kareer.domain.jobposting.exception.JobPostingException; -import org.sopt.kareer.global.external.clova.service.DocumentProcessingService; +import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; diff --git a/src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java b/src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java new file mode 100644 index 0000000..eb35b66 --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/dto/response/PageText.java @@ -0,0 +1,3 @@ +package org.sopt.kareer.global.document.dto.response; + +public record PageText(int pageNumber, String text) {} diff --git a/src/main/java/org/sopt/kareer/global/external/clova/service/DocumentProcessingService.java b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java similarity index 81% rename from src/main/java/org/sopt/kareer/global/external/clova/service/DocumentProcessingService.java rename to src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java index 1bcaa04..12cb5bc 100644 --- a/src/main/java/org/sopt/kareer/global/external/clova/service/DocumentProcessingService.java +++ b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java @@ -1,13 +1,16 @@ -package org.sopt.kareer.global.external.clova.service; +package org.sopt.kareer.global.document.service; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; -import org.sopt.kareer.global.external.ai.exception.RagErrorCode; -import org.sopt.kareer.global.external.ai.exception.RagException; -import org.sopt.kareer.global.external.clova.dto.response.PageText; +import org.sopt.kareer.global.document.dto.response.PageText; +import org.sopt.kareer.global.document.exception.DocumentErrorCode; +import org.sopt.kareer.global.document.exception.DocumentException; +import org.sopt.kareer.global.external.clova.exception.ClovaErrorCode; +import org.sopt.kareer.global.external.clova.exception.ClovaException; +import org.sopt.kareer.global.external.clova.service.ClovaOcrService; import org.springframework.stereotype.Service; import java.awt.image.BufferedImage; @@ -55,7 +58,7 @@ public List extractPagesWithOcr(File pdfFile) { return pages; } catch (Exception e) { - throw new RagException(RagErrorCode.EXTRACT_IMAGE_FAILED, e.getMessage()); + throw new ClovaException(ClovaErrorCode.EXTRACT_IMAGE_FAILED, e.getMessage()); } } @@ -69,7 +72,7 @@ private int getTotalPages(File pdfFile) { try (PDDocument document = PDDocument.load(pdfFile)) { return document.getNumberOfPages(); } catch (IOException e) { - throw new RagException(RagErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); + throw new DocumentException(DocumentErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); } } @@ -92,7 +95,7 @@ public List extractPageFromPdf(File pdfFile) { } return pages; } catch (IOException e) { - throw new RagException(RagErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); + throw new DocumentException(DocumentErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); } } diff --git a/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java b/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java index 2c4810e..463b08a 100644 --- a/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java +++ b/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java @@ -6,6 +6,7 @@ import org.sopt.kareer.domain.jobposting.exception.JobPostingErrorCode; import org.sopt.kareer.domain.jobposting.exception.JobPostingException; import org.sopt.kareer.domain.jobposting.repository.JobPostingRepository; +import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.sopt.kareer.global.external.ai.builder.JobPostingEmbeddingTextBuilder; import org.sopt.kareer.global.external.ai.dto.response.RequiredSection; import org.sopt.kareer.global.external.ai.enums.RequiredCategory; @@ -13,7 +14,6 @@ import org.sopt.kareer.global.external.ai.exception.RagException; import org.sopt.kareer.global.external.ai.util.OcrTextNormalizer; import org.sopt.kareer.global.external.ai.util.RequiredPdfParser; -import org.sopt.kareer.global.external.clova.service.DocumentProcessingService; import org.springframework.ai.document.Document; import org.springframework.ai.transformer.splitter.TokenTextSplitter; import org.springframework.ai.vectorstore.pgvector.PgVectorStore; diff --git a/src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java b/src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java deleted file mode 100644 index 4e012db..0000000 --- a/src/main/java/org/sopt/kareer/global/external/clova/dto/response/PageText.java +++ /dev/null @@ -1,3 +0,0 @@ -package org.sopt.kareer.global.external.clova.dto.response; - -public record PageText(int pageNumber, String text) {} From 5934326abf22f64ed02db6117697a0128bddf78d Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 14:50:20 +0900 Subject: [PATCH 05/20] =?UTF-8?q?[Refactor]=20#204=20=EB=AC=B8=EC=84=9C?= =?UTF-8?q?=EC=97=90=EC=84=9C=20=ED=85=8D=EC=8A=A4=ED=8A=B8=20=EC=B6=94?= =?UTF-8?q?=EC=B6=9C=20=ED=8C=8C=EC=9D=BC=20=ED=98=95=EC=8B=9D=EC=97=90=20?= =?UTF-8?q?=EB=94=B0=EB=9D=BC=20=EB=B6=84=EA=B8=B0=EC=B2=98=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../jobposting/util/ResumeContextService.java | 12 +- .../document/exception/DocumentErrorCode.java | 3 + .../service/DocumentProcessingService.java | 120 ++++++++++-------- .../ai/service/RagEmbeddingService.java | 20 +-- 4 files changed, 74 insertions(+), 81 deletions(-) diff --git a/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java b/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java index cb874d3..fc2dc1b 100644 --- a/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java +++ b/src/main/java/org/sopt/kareer/domain/jobposting/util/ResumeContextService.java @@ -6,7 +6,6 @@ import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; -import java.io.File; import java.util.List; import static org.sopt.kareer.domain.jobposting.exception.JobPostingErrorCode.RESUME_CONTEXT_FAILED; @@ -32,13 +31,8 @@ public String buildContext(List files) { sb.append("[RESUME_COVER_LETTER]\n"); for (MultipartFile file : files) { - File temp = null; - try { - temp = File.createTempFile("resume_", ".pdf"); - file.transferTo(temp); - - String text = documentProcessingService.extractTextWithOcr(temp); + String text = documentProcessingService.extractText(file); sb.append("----- FILE START -----\n"); sb.append(text).append("\n"); @@ -46,10 +40,6 @@ public String buildContext(List files) { } catch (Exception e) { throw new JobPostingException(RESUME_CONTEXT_FAILED, e.getMessage()); - } finally { - if (temp != null && temp.exists()) { - temp.delete(); - } } } diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java index 9cbafce..13cfe0c 100644 --- a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java @@ -7,6 +7,9 @@ @RequiredArgsConstructor public enum DocumentErrorCode implements ErrorCode { EXTRACT_TEXT_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "PDF 파일에서 텍스트 추출에 실패하였습니다."), + INVALID_IMAGE_FILE(HttpStatus.BAD_REQUEST.value(), "유효하지 않은 이미지 파일입니다."), + FILE_EMPTY(HttpStatus.BAD_REQUEST.value(), "파일이 비어있습니다."), + UNSUPPORTED_FILE_TYPE(HttpStatus.BAD_REQUEST.value(), "지원하지 않는 파일 형식입니다."), ; private final int httpStatus; diff --git a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java index 12cb5bc..66df032 100644 --- a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java +++ b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java @@ -1,52 +1,76 @@ package org.sopt.kareer.global.document.service; import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import org.sopt.kareer.global.document.dto.response.PageText; import org.sopt.kareer.global.document.exception.DocumentErrorCode; import org.sopt.kareer.global.document.exception.DocumentException; -import org.sopt.kareer.global.external.clova.exception.ClovaErrorCode; -import org.sopt.kareer.global.external.clova.exception.ClovaException; import org.sopt.kareer.global.external.clova.service.ClovaOcrService; import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; +import javax.imageio.ImageIO; import java.awt.image.BufferedImage; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.stream.Collectors; -@Slf4j @Service @RequiredArgsConstructor public class DocumentProcessingService { - private static final double MIN_TEXT_PAGE_RATIO = 0.2; private static final int OCR_DPI = 300; + private static final int MIN_TEXT_LENGTH = 20; private final ClovaOcrService clovaOcrService; - public List extractPagesWithOcr(File pdfFile) { + public String extractText(MultipartFile file) { + return extractPagesWithOcr(file).stream() + .map(PageText::text) + .collect(Collectors.joining("\n")); + } + + public List extractPagesWithOcr(MultipartFile file) { + validate(file); + + String contentType = file.getContentType(); + String filename = file.getOriginalFilename(); + + try { + if (isPdf(contentType, filename)) { + return extractPagesFromPdf(file); + } + + if (isImage(contentType)) { + return extractPagesFromImage(file); + } - List textPages = extractPageFromPdf(pdfFile); - int totalPages = getTotalPages(pdfFile); + throw new DocumentException(DocumentErrorCode.UNSUPPORTED_FILE_TYPE); + } catch (DocumentException e) { + throw e; + } catch (Exception e) { + throw new DocumentException(DocumentErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); + } + } - Map pageTextMap = textPages.stream() - .collect(Collectors.toMap(PageText::pageNumber, PageText::text)); + private List extractPagesFromPdf(MultipartFile file) throws IOException { + try (PDDocument document = PDDocument.load(file.getInputStream())) { + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setSortByPosition(true); - try (PDDocument document = PDDocument.load(pdfFile)) { PDFRenderer renderer = new PDFRenderer(document); - List pages = new ArrayList<>(totalPages); + List pages = new ArrayList<>(); + + for (int i = 1; i <= document.getNumberOfPages(); i++) { + stripper.setStartPage(i); + stripper.setEndPage(i); - for (int i = 1; i <= totalPages; i++) { - String text = pageTextMap.getOrDefault(i, ""); + String text = sanitizeText(stripper.getText(document)); - if (textPages.size() < Math.max(1, (int) Math.ceil(totalPages * MIN_TEXT_PAGE_RATIO))) { + if (text.isBlank() || text.length() < MIN_TEXT_LENGTH) { BufferedImage image = renderer.renderImageWithDPI(i - 1, OCR_DPI); text = sanitizeText(clovaOcrService.doOcr(image)); } @@ -55,55 +79,47 @@ public List extractPagesWithOcr(File pdfFile) { pages.add(new PageText(i, text)); } } - return pages; - } catch (Exception e) { - throw new ClovaException(ClovaErrorCode.EXTRACT_IMAGE_FAILED, e.getMessage()); + return pages; } } - public String extractTextWithOcr(File pdfFile) { - return extractPagesWithOcr(pdfFile).stream() - .map(PageText::text) - .reduce("", (a, b) -> a + "\n" + b); - } - - private int getTotalPages(File pdfFile) { - try (PDDocument document = PDDocument.load(pdfFile)) { - return document.getNumberOfPages(); - } catch (IOException e) { - throw new DocumentException(DocumentErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); + private List extractPagesFromImage(MultipartFile file) throws IOException { + BufferedImage image = ImageIO.read(file.getInputStream()); + if (image == null) { + throw new DocumentException(DocumentErrorCode.INVALID_IMAGE_FILE); } - } - public List extractPageFromPdf(File pdfFile) { - try (PDDocument document = PDDocument.load(pdfFile)) { - PDFTextStripper stripper = new PDFTextStripper(); - stripper.setSortByPosition(true); + String text = sanitizeText(clovaOcrService.doOcr(image)); - int totalPages = document.getNumberOfPages(); - List pages = new ArrayList<>(totalPages); + if (text.isBlank()) { + return List.of(); + } - for (int i = 1; i <= totalPages; i++) { - stripper.setStartPage(i); - stripper.setEndPage(i); + return List.of(new PageText(1, text)); + } - String cleanedText = sanitizeText(stripper.getText(document)); - if (!cleanedText.isBlank()) { - pages.add(new PageText(i, cleanedText)); - } - } - return pages; - } catch (IOException e) { - throw new DocumentException(DocumentErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); + private void validate(MultipartFile file) { + if (file == null || file.isEmpty()) { + throw new DocumentException(DocumentErrorCode.FILE_EMPTY, "파일이 비어 있습니다."); } } + private boolean isPdf(String contentType, String filename) { + return "application/pdf".equalsIgnoreCase(contentType) + || (filename != null && filename.toLowerCase().endsWith(".pdf")); + } + + private boolean isImage(String contentType) { + return contentType != null && contentType.startsWith("image/"); + } + private static String sanitizeText(String s) { return s == null ? "" : s.replace("\u0000", "") .replaceAll("[\\x01-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]", " ") - .replace('\uFFFD', ' '); + .replace('\uFFFD', ' ') + .replaceAll("\\s+", " ") + .trim(); } - -} +} \ No newline at end of file diff --git a/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java b/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java index 463b08a..1346a36 100644 --- a/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java +++ b/src/main/java/org/sopt/kareer/global/external/ai/service/RagEmbeddingService.java @@ -21,7 +21,6 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.web.multipart.MultipartFile; -import java.io.File; import java.util.*; import static org.sopt.kareer.global.external.ai.constant.RequiredDocumentConstant.*; @@ -75,18 +74,13 @@ public void embedJobPosting(List jobPostingIds) { } private void uploadDocument(List files, PgVectorStore targetStore) { - File temp = null; - for (MultipartFile file : files) { try { - temp = File.createTempFile("upload_", ".pdf"); - file.transferTo(temp); - Map baseMeta = new HashMap<>(); baseMeta.put("originalFilename", Objects.toString(file.getOriginalFilename(), "")); baseMeta.put("uploadedAt", System.currentTimeMillis()); - var pages = documentProcessingService.extractPagesWithOcr(temp); + var pages = documentProcessingService.extractPagesWithOcr(file); List toStore = new ArrayList<>(); for (var page : pages) { @@ -100,8 +94,6 @@ private void uploadDocument(List files, PgVectorStore targetStore } catch (Exception e) { throw new RagException(RagErrorCode.EMBEDDING_FAILED, e.getMessage()); - } finally { - if (temp != null && temp.exists()) temp.delete(); } } } @@ -112,14 +104,8 @@ public void uploadRequiredDocument(MultipartFile file, RequiredCategory required } private void uploadAndIngest(MultipartFile file, String source, RequiredCategory category) { - if (file == null || file.isEmpty()) return; - - File temp = null; try { - temp = File.createTempFile("upload_", ".pdf"); - file.transferTo(temp); - - var pages = documentProcessingService.extractPagesWithOcr(temp); + var pages = documentProcessingService.extractPagesWithOcr(file); StringBuilder full = new StringBuilder(); for (var p : pages) { full.append(p.text()).append("\n"); @@ -149,8 +135,6 @@ private void uploadAndIngest(MultipartFile file, String source, RequiredCategory } catch (Exception e) { throw new RagException(RagErrorCode.EMBEDDING_FAILED, e.getMessage()); - } finally { - if (temp != null && temp.exists()) temp.delete(); } } From 293a86af55e121e5d67f21678fb9d2a5a7fb5d8c Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 15:06:54 +0900 Subject: [PATCH 06/20] =?UTF-8?q?[Feat]=20#204=20VisaType=EC=97=90=20?= =?UTF-8?q?=EB=A7=A4=ED=95=91=20=EB=A9=94=EC=84=9C=EB=93=9C=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/member/entity/enums/VisaType.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java b/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java index 61b4282..1efb8b8 100644 --- a/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java +++ b/src/main/java/org/sopt/kareer/domain/member/entity/enums/VisaType.java @@ -12,4 +12,25 @@ public enum VisaType { ; private final String description; + + public static VisaType from(String originalText) { + if (originalText == null || originalText.isBlank()) { + return null; + } + + String normalized = normalize(originalText); + + for (VisaType visaType : values()) { + if (normalize(visaType.name()).equals(normalized) + || normalize(visaType.description).equals(normalized)) { + return visaType; + } + } + + return null; + } + + private static String normalize(String value) { + return value.replaceAll("[^A-Za-z0-9]", "").toUpperCase(); + } } From f5e3273d4009f4786c7747523bfc83dbb03cf8e2 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 15:20:57 +0900 Subject: [PATCH 07/20] =?UTF-8?q?[Feat]=20#204=20=EB=B9=84=EC=9E=90=20?= =?UTF-8?q?=EB=AC=B8=EC=84=9C=EB=A1=9C=EB=B6=80=ED=84=B0=20=EB=B9=84?= =?UTF-8?q?=EC=9E=90=20=EC=A0=95=EB=B3=B4=20=EC=B6=94=EC=B6=9C=20=EB=A1=9C?= =?UTF-8?q?=EC=A7=81=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/member/util/VisaOcrParser.java | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java diff --git a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java new file mode 100644 index 0000000..b5fdbcf --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java @@ -0,0 +1,219 @@ +package org.sopt.kareer.domain.member.util; + +import lombok.extern.slf4j.Slf4j; +import org.sopt.kareer.domain.member.entity.enums.VisaType; +import org.springframework.stereotype.Component; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Slf4j +@Component +public class VisaOcrParser { + + private static final Pattern DATE_PATTERN = Pattern.compile( + "\\b(" + + "\\d{4}[./-]\\d{1,2}[./-]\\d{1,2}" + + "|" + + "\\d{1,2}[./-]\\d{1,2}[./-]\\d{4}" + + "|" + + "\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4}" + + ")\\b", + Pattern.CASE_INSENSITIVE + ); + + // 비자유형 파싱용 + private static final Pattern STATUS_PATTERN = Pattern.compile( + "(?i)(status|체류자격)\\s*[::]?\\s*([A-Z]\\s*-?\\s*\\d{1,2})" + ); + + private static final Pattern START_DATE_PATTERN = Pattern.compile( + "(?i)(issue\\s*date|date\\s*of\\s*issue|grant\\s*date|issued\\s*on|발급일)\\s*[::]?\\s*([0-9./\\- ]{8,20}|\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4})" + ); + + private static final Pattern EXPIRE_DATE_PATTERN = Pattern.compile( + "(?i)(final\\s*entry\\s*date|expiry\\s*date|expiration\\s*date|valid\\s*until|until|만료일)\\s*[::]?\\s*([0-9./\\- ]{8,20}|\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4})" + ); + + public VisaInfo parse(String rawText) { + String text = normalize(rawText); + + VisaType visaType = extractVisaType(text); + LocalDate visaStartDate = extractVisaStartDate(text); + LocalDate visaExpiredAt = extractVisaExpiredAt(text); + + List allDates = extractAllDates(text); + + if (visaStartDate == null) { + visaStartDate = inferStartDate(allDates, visaExpiredAt); + } + + if (visaExpiredAt == null) { + visaExpiredAt = inferExpireDate(allDates, visaStartDate); + } + + if (visaExpiredAt == null) { + visaExpiredAt = extractExpireDateFromMrz(text); + } + + return new VisaInfo(visaType, visaStartDate, visaExpiredAt); + } + + private String normalize(String text) { + return text == null ? "" : text.replaceAll("\\s+", " ").trim(); + } + + private VisaType extractVisaType(String text) { + Matcher matcher = STATUS_PATTERN.matcher(text); + if (matcher.find()) { + VisaType visaType = VisaType.from(matcher.group(2)); + if (visaType != null) { + return visaType; + } + } + + Pattern fallbackPattern = Pattern.compile("\\b([A-Z]\\s*-?\\s*\\d{1,2})\\b"); + Matcher fallbackMatcher = fallbackPattern.matcher(text); + + while (fallbackMatcher.find()) { + VisaType visaType = VisaType.from(fallbackMatcher.group(1)); + if (visaType != null) { + return visaType; + } + } + + return null; + } + + private LocalDate extractVisaStartDate(String text) { + Matcher matcher = START_DATE_PATTERN.matcher(text); + if (matcher.find()) { + return parseDate(matcher.group(2)); + } + return null; + } + + private LocalDate extractVisaExpiredAt(String text) { + Matcher matcher = EXPIRE_DATE_PATTERN.matcher(text); + if (matcher.find()) { + return parseDate(matcher.group(2)); + } + return null; + } + + private List extractAllDates(String text) { + List dates = new ArrayList<>(); + Matcher matcher = DATE_PATTERN.matcher(text); + + while (matcher.find()) { + LocalDate parsed = parseDate(matcher.group(1)); + if (parsed != null) { + dates.add(parsed); + } + } + + return dates; + } + + private LocalDate inferStartDate(List dates, LocalDate expiredAt) { + if (dates.isEmpty()) { + return null; + } + + if (expiredAt != null) { + return dates.stream() + .filter(date -> !date.isAfter(expiredAt)) + .min(LocalDate::compareTo) + .orElse(null); + } + + return dates.stream() + .min(LocalDate::compareTo) + .orElse(null); + } + + private LocalDate inferExpireDate(List dates, LocalDate startDate) { + if (dates.isEmpty()) { + return null; + } + + if (startDate != null) { + return dates.stream() + .filter(date -> !date.isBefore(startDate)) + .max(LocalDate::compareTo) + .orElse(null); + } + + return dates.stream() + .max(LocalDate::compareTo) + .orElse(null); + } + + private LocalDate extractExpireDateFromMrz(String text) { + Pattern mrzPattern = Pattern.compile("[MF<](\\d{6})"); + Matcher matcher = mrzPattern.matcher(text); + + while (matcher.find()) { + LocalDate parsed = parseYYMMDD(matcher.group(1)); + if (parsed != null) { + return parsed; + } + } + + return null; + } + + private LocalDate parseDate(String raw) { + if (raw == null || raw.isBlank()) { + return null; + } + + String value = raw.trim().toUpperCase(Locale.ROOT).replaceAll("\\s+", " "); + + DateTimeFormatter[] formatters = new DateTimeFormatter[]{ + DateTimeFormatter.ofPattern("yyyy-MM-dd"), + DateTimeFormatter.ofPattern("yyyy.MM.dd"), + DateTimeFormatter.ofPattern("yyyy/MM/dd"), + DateTimeFormatter.ofPattern("dd-MM-yyyy"), + DateTimeFormatter.ofPattern("dd.MM.yyyy"), + DateTimeFormatter.ofPattern("dd/MM/yyyy"), + DateTimeFormatter.ofPattern("d MMM yyyy", Locale.ENGLISH), + DateTimeFormatter.ofPattern("dd MMM yyyy", Locale.ENGLISH) + }; + + for (DateTimeFormatter formatter : formatters) { + try { + return LocalDate.parse(value, formatter); + } catch (Exception ignored) { + } + } + + return null; + } + + private LocalDate parseYYMMDD(String value) { + try { + int year = Integer.parseInt(value.substring(0, 2)); + int month = Integer.parseInt(value.substring(2, 4)); + int day = Integer.parseInt(value.substring(4, 6)); + + year += (year >= 50 ? 1900 : 2000); + + return LocalDate.of(year, month, day); + } catch (Exception e) { + return null; + } + } + + public record VisaInfo( + VisaType visaType, + LocalDate visaStartDate, + LocalDate visaExpiredAt + ) { + } +} From 121f29286163610ef409fe95cce7c15529c1cfb2 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 15:21:17 +0900 Subject: [PATCH 08/20] =?UTF-8?q?[Feat]=20#204=20OCR=EC=9D=84=20=ED=86=B5?= =?UTF-8?q?=ED=95=9C=20=EB=B9=84=EC=9E=90=20=EC=A0=95=EB=B3=B4=20=EC=B6=94?= =?UTF-8?q?=EC=B6=9C=20=EB=A1=9C=EC=A7=81=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../member/controller/MemberController.java | 18 +++++++++-- .../member/dto/response/OcrVisaResponse.java | 29 ++++++++++++++++++ .../domain/member/service/MemberService.java | 30 +++++++++++++++---- 3 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java diff --git a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java index c6eb2b9..d195f20 100644 --- a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java +++ b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java @@ -1,8 +1,6 @@ package org.sopt.kareer.domain.member.controller; -import static org.sopt.kareer.global.config.swagger.SwaggerResponseDescription.*; - import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; import jakarta.servlet.http.HttpServletRequest; @@ -12,7 +10,9 @@ import org.sopt.kareer.domain.member.dto.request.MemberOnboardRequest; import org.sopt.kareer.domain.member.dto.request.MypageRequest; import org.sopt.kareer.domain.member.dto.response.*; -import org.sopt.kareer.domain.member.entity.constants.*; +import org.sopt.kareer.domain.member.entity.constants.Field; +import org.sopt.kareer.domain.member.entity.constants.Major; +import org.sopt.kareer.domain.member.entity.constants.University; import org.sopt.kareer.domain.member.entity.enums.Country; import org.sopt.kareer.domain.member.service.MemberService; import org.sopt.kareer.domain.roadmap.dto.response.RoadmapTestResponse; @@ -23,9 +23,13 @@ import org.sopt.kareer.global.config.swagger.SwaggerResponseDescription; import org.sopt.kareer.global.response.BaseResponse; import org.springframework.http.HttpStatus; +import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.security.core.annotation.AuthenticationPrincipal; import org.springframework.web.bind.annotation.*; +import org.springframework.web.multipart.MultipartFile; + +import static org.sopt.kareer.global.config.swagger.SwaggerResponseDescription.*; @RestController @RequiredArgsConstructor @@ -155,4 +159,12 @@ public ResponseEntity> deleteMember(@AuthenticationPrincipal .body(BaseResponse.ok("회원 탈퇴에 성공하였습니다.")); } + @Operation(summary = "온보딩 비자 OCR API", description = "온보딩 과정에서 유저의 비자 문서를 분석하여 정보를 추출합니다.") + @PostMapping(value = "/onboard/ocr/visa", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public ResponseEntity> getVisaInfo( + @RequestPart("file")MultipartFile file){ + return ResponseEntity.status(HttpStatus.OK) + .body(BaseResponse.ok(memberService.getVisaOcr(file), "사용자 비자 정보 추출에 성공했습니다.")); + } + } diff --git a/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java new file mode 100644 index 0000000..b237fdb --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrVisaResponse.java @@ -0,0 +1,29 @@ +package org.sopt.kareer.domain.member.dto.response; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Builder; +import org.sopt.kareer.domain.member.entity.enums.VisaType; +import org.sopt.kareer.domain.member.util.VisaOcrParser; + +import java.time.LocalDate; + +@Builder +public record OcrVisaResponse( + @Schema(description = "비자 유형") + VisaType visaType, + + @Schema(description = "비자 발급일") + LocalDate visaStartDate, + + @Schema(description = "비자 만료일") + LocalDate visaExpiredAt +){ + public static OcrVisaResponse from(VisaOcrParser.VisaInfo visaInfo) { + return OcrVisaResponse.builder() + .visaType(visaInfo.visaType()) + .visaStartDate(visaInfo.visaStartDate()) + .visaExpiredAt(visaInfo.visaExpiredAt()) + .build(); + } +} + diff --git a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java index 69d6405..34628bc 100644 --- a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java +++ b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java @@ -1,19 +1,29 @@ package org.sopt.kareer.domain.member.service; import lombok.RequiredArgsConstructor; -import org.sopt.kareer.domain.member.dto.request.*; -import org.sopt.kareer.domain.member.dto.response.*; -import org.sopt.kareer.domain.member.entity.*; +import org.sopt.kareer.domain.member.dto.request.MemberOnboardRequest; +import org.sopt.kareer.domain.member.dto.request.MemberOnboardV2Request; +import org.sopt.kareer.domain.member.dto.response.MemberInfoResponse; +import org.sopt.kareer.domain.member.dto.response.MemberStatusResponse; +import org.sopt.kareer.domain.member.dto.response.MypageResponse; +import org.sopt.kareer.domain.member.dto.response.OcrVisaResponse; +import org.sopt.kareer.domain.member.entity.Member; +import org.sopt.kareer.domain.member.entity.MemberVisa; import org.sopt.kareer.domain.member.entity.enums.MemberStatus; -import org.sopt.kareer.domain.member.exception.*; -import org.sopt.kareer.domain.member.repository.*; +import org.sopt.kareer.domain.member.exception.MemberErrorCode; +import org.sopt.kareer.domain.member.exception.MemberException; +import org.sopt.kareer.domain.member.repository.MemberRepository; +import org.sopt.kareer.domain.member.repository.MemberVisaRepository; import org.sopt.kareer.domain.member.service.dto.request.MypageCommand; +import org.sopt.kareer.domain.member.util.VisaOcrParser; +import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.sopt.kareer.global.exception.customexception.GlobalException; import org.sopt.kareer.global.exception.errorcode.GlobalErrorCode; import org.sopt.kareer.global.oauth.dto.OAuthAttributes; import org.springframework.dao.DataIntegrityViolationException; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; +import org.springframework.web.multipart.MultipartFile; @Service @RequiredArgsConstructor @@ -23,6 +33,8 @@ public class MemberService { private final MemberRepository memberRepository; private final MemberVisaRepository memberVisaRepository; private final MemberDeletionService memberDeletionService; + private final DocumentProcessingService documentProcessingService; + private final VisaOcrParser visaOcrParser; public Member getById(Long memberId) { return memberRepository.findById(memberId) @@ -166,4 +178,12 @@ public void deleteMember(Long memberId) { Member member = getById(memberId); memberDeletionService.deleteMember(member); } + + + public OcrVisaResponse getVisaOcr(MultipartFile file) { + String text = documentProcessingService.extractText(file); + VisaOcrParser.VisaInfo visaInfo = visaOcrParser.parse(text); + + return OcrVisaResponse.from(visaInfo); + } } From b9ad8e68d00ade7c17370ff5cde37c5f84090e0b Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 15:31:35 +0900 Subject: [PATCH 09/20] =?UTF-8?q?[Fix]=20#204=20=EB=82=A0=EC=A7=9C=20?= =?UTF-8?q?=ED=8C=8C=EC=8B=B1=20=EC=98=A4=EB=A5=98=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 생년월일을 비자 발급일로 파싱하던 오류를 수정함 --- .../domain/member/util/VisaOcrParser.java | 58 +++++++------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java index b5fdbcf..934015b 100644 --- a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java +++ b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java @@ -7,6 +7,7 @@ import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.regex.Matcher; @@ -16,32 +17,29 @@ @Component public class VisaOcrParser { + private static final String DATE_REGEX = + "(\\d{4}[./-]\\d{1,2}[./-]\\d{1,2}|\\d{1,2}[./-]\\d{1,2}[./-]\\d{4}|\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4})"; + private static final Pattern DATE_PATTERN = Pattern.compile( - "\\b(" + - "\\d{4}[./-]\\d{1,2}[./-]\\d{1,2}" + - "|" + - "\\d{1,2}[./-]\\d{1,2}[./-]\\d{4}" + - "|" + - "\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4}" + - ")\\b", + "\\b" + DATE_REGEX + "\\b", Pattern.CASE_INSENSITIVE ); - // 비자유형 파싱용 - private static final Pattern STATUS_PATTERN = Pattern.compile( - "(?i)(status|체류자격)\\s*[::]?\\s*([A-Z]\\s*-?\\s*\\d{1,2})" + private static final Pattern SUPPORTED_VISA_PATTERN = Pattern.compile( + "(?i)\\b(D\\s*-?\\s*2|D\\s*-?\\s*10|E\\s*-?\\s*7)\\b" ); private static final Pattern START_DATE_PATTERN = Pattern.compile( - "(?i)(issue\\s*date|date\\s*of\\s*issue|grant\\s*date|issued\\s*on|발급일)\\s*[::]?\\s*([0-9./\\- ]{8,20}|\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4})" + "(?i)(issue\\s*date|date\\s*of\\s*issue|grant\\s*date|발급일)\\s*[/|:]?\\s*" + DATE_REGEX ); private static final Pattern EXPIRE_DATE_PATTERN = Pattern.compile( - "(?i)(final\\s*entry\\s*date|expiry\\s*date|expiration\\s*date|valid\\s*until|until|만료일)\\s*[::]?\\s*([0-9./\\- ]{8,20}|\\d{1,2}\\s+[A-Z]{3}\\s+\\d{4})" + "(?i)(final\\s*entry\\s*date|expiry\\s*date|expiration\\s*date|valid\\s*until|만료일|입국만료일)\\s*[/|:]?\\s*" + DATE_REGEX ); public VisaInfo parse(String rawText) { String text = normalize(rawText); + log.info("rawText: {}", text); VisaType visaType = extractVisaType(text); LocalDate visaStartDate = extractVisaStartDate(text); @@ -49,10 +47,6 @@ public VisaInfo parse(String rawText) { List allDates = extractAllDates(text); - if (visaStartDate == null) { - visaStartDate = inferStartDate(allDates, visaExpiredAt); - } - if (visaExpiredAt == null) { visaExpiredAt = inferExpireDate(allDates, visaStartDate); } @@ -61,6 +55,10 @@ public VisaInfo parse(String rawText) { visaExpiredAt = extractExpireDateFromMrz(text); } + if (visaStartDate == null) { + visaStartDate = inferStartDate(allDates, visaExpiredAt); + } + return new VisaInfo(visaType, visaStartDate, visaExpiredAt); } @@ -69,24 +67,10 @@ private String normalize(String text) { } private VisaType extractVisaType(String text) { - Matcher matcher = STATUS_PATTERN.matcher(text); + Matcher matcher = SUPPORTED_VISA_PATTERN.matcher(text); if (matcher.find()) { - VisaType visaType = VisaType.from(matcher.group(2)); - if (visaType != null) { - return visaType; - } - } - - Pattern fallbackPattern = Pattern.compile("\\b([A-Z]\\s*-?\\s*\\d{1,2})\\b"); - Matcher fallbackMatcher = fallbackPattern.matcher(text); - - while (fallbackMatcher.find()) { - VisaType visaType = VisaType.from(fallbackMatcher.group(1)); - if (visaType != null) { - return visaType; - } + return VisaType.from(matcher.group(1)); } - return null; } @@ -127,8 +111,8 @@ private LocalDate inferStartDate(List dates, LocalDate expiredAt) { if (expiredAt != null) { return dates.stream() - .filter(date -> !date.isAfter(expiredAt)) - .min(LocalDate::compareTo) + .filter(date -> date.isBefore(expiredAt)) + .max(LocalDate::compareTo) .orElse(null); } @@ -144,13 +128,13 @@ private LocalDate inferExpireDate(List dates, LocalDate startDate) { if (startDate != null) { return dates.stream() - .filter(date -> !date.isBefore(startDate)) + .filter(date -> date.isAfter(startDate)) .max(LocalDate::compareTo) .orElse(null); } return dates.stream() - .max(LocalDate::compareTo) + .max(Comparator.naturalOrder()) .orElse(null); } @@ -216,4 +200,4 @@ public record VisaInfo( LocalDate visaExpiredAt ) { } -} +} \ No newline at end of file From 3e2a3c5612c85253097ddf3dcf762d6021f957ca Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 19:33:22 +0900 Subject: [PATCH 10/20] =?UTF-8?q?[Refactor]=20#204=20OCR=20=EB=A9=94?= =?UTF-8?q?=EC=84=9C=EB=93=9C=20=EB=AA=A8=EB=93=A0=20=EC=9D=B4=EB=AF=B8?= =?UTF-8?q?=EC=A7=80=20=EC=9C=A0=ED=98=95=EC=97=90=20=EC=A0=81=EC=9A=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 기존에 JPG밖에 지원을 안 했어서 Naver Clova OCR API가 지원하는 JPG, JPEG, PNG에 대해 적용될 수 있도록 수정함. - OCR 수행 중 내부 로직에서 예외 발생 시 이를 응답으로 명확하게 내려주도록 함 --- .../clova/service/ClovaOcrService.java | 101 ++++++++++++++++-- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java index 0d09567..2997d8a 100644 --- a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java +++ b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java @@ -1,16 +1,20 @@ package org.sopt.kareer.global.external.clova.service; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; import org.sopt.kareer.global.external.clova.dto.request.ClovaOcrRequest; import org.sopt.kareer.global.external.clova.dto.response.ClovaOcrResponse; import org.sopt.kareer.global.external.clova.exception.ClovaErrorCode; import org.sopt.kareer.global.external.clova.exception.ClovaException; import org.springframework.beans.factory.annotation.Value; +import org.springframework.http.HttpStatusCode; import org.springframework.http.MediaType; import org.springframework.stereotype.Service; import org.springframework.web.reactive.function.client.WebClient; +import reactor.core.publisher.Mono; import javax.imageio.ImageIO; +import java.awt.*; import java.awt.image.BufferedImage; import java.io.ByteArrayOutputStream; import java.time.Duration; @@ -19,10 +23,13 @@ import java.util.UUID; import java.util.stream.Collectors; +@Slf4j @Service @RequiredArgsConstructor public class ClovaOcrService { + private static final String OCR_IMAGE_FORMAT = "png"; + private final WebClient clovaOcrWebClient; @Value("${spring.clova.ocr.timeout-ms:15000}") @@ -30,35 +37,66 @@ public class ClovaOcrService { public String doOcr(BufferedImage image) { try { - String base64 = Base64.getEncoder().encodeToString(toJpgBytes(image)); + byte[] imageBytes = toOcrBytes(image); + String base64 = Base64.getEncoder().encodeToString(imageBytes); + + log.info("CLOVA OCR request image size: {} bytes", imageBytes.length); ClovaOcrRequest body = new ClovaOcrRequest( "V2", UUID.randomUUID().toString(), System.currentTimeMillis(), - List.of(new ClovaOcrRequest.Image("jpg", "page", base64)) + List.of(new ClovaOcrRequest.Image(OCR_IMAGE_FORMAT, "page", base64)) ); ClovaOcrResponse response = clovaOcrWebClient.post() .uri("") .contentType(MediaType.APPLICATION_JSON) .bodyValue(body) - .retrieve() - .bodyToMono(ClovaOcrResponse.class) + .exchangeToMono(clientResponse -> { + HttpStatusCode status = clientResponse.statusCode(); + + if (status.is2xxSuccessful()) { + return clientResponse.bodyToMono(ClovaOcrResponse.class); + } + + return clientResponse.bodyToMono(String.class) + .defaultIfEmpty("") + .flatMap(errorBody -> { + log.error("CLOVA OCR error response. status={}, body={}", + status.value(), errorBody); + + return Mono.error( + new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "CLOVA OCR error. status=" + status.value() + ", body=" + errorBody + ) + ); + }); + }) .block(Duration.ofMillis(timeoutMs)); - if (response == null || response.images() == null || response.images().isEmpty()) return ""; + if (response == null || response.images() == null || response.images().isEmpty()) { + log.warn("CLOVA OCR response is empty"); + return ""; + } var fields = response.images().get(0).fields(); - if (fields == null || fields.isEmpty()) return ""; + if (fields == null || fields.isEmpty()) { + log.warn("CLOVA OCR fields are empty"); + return ""; + } return fields.stream() .map(ClovaOcrResponse.Field::inferText) - .filter(s -> s != null && !s.isBlank()) + .filter(text -> text != null && !text.isBlank()) .map(String::trim) .collect(Collectors.joining(" ")); + } catch (ClovaException e) { + throw e; } catch (Exception e) { + log.error("CLOVA OCR failed", e); throw new ClovaException( ClovaErrorCode.EXTRACT_IMAGE_FAILED, "CLOVA OCR failed: " + e.getMessage() @@ -66,12 +104,55 @@ public String doOcr(BufferedImage image) { } } - private byte[] toJpgBytes(BufferedImage image) { + private byte[] toOcrBytes(BufferedImage image) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - ImageIO.write(image, "jpg", baos); + BufferedImage normalized = normalizeImage(image); + + boolean success = ImageIO.write(normalized, OCR_IMAGE_FORMAT, baos); + if (!success || baos.size() == 0) { + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "Image encoding failed" + ); + } + return baos.toByteArray(); + } catch (ClovaException e) { + throw e; } catch (Exception e) { - throw new ClovaException(ClovaErrorCode.EXTRACT_IMAGE_FAILED, "Image encoding failed: " + e.getMessage()); + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "Image encoding failed: " + e.getMessage() + ); } } + + /** + * 모든 입력 이미지를 OCR 전송용 표준 RGB 이미지로 정규화한다. + */ + private BufferedImage normalizeImage(BufferedImage source) { + if (source == null) { + throw new ClovaException( + ClovaErrorCode.EXTRACT_IMAGE_FAILED, + "Image is null" + ); + } + + BufferedImage target = new BufferedImage( + source.getWidth(), + source.getHeight(), + BufferedImage.TYPE_INT_RGB + ); + + Graphics2D g = target.createGraphics(); + try { + g.setColor(Color.WHITE); + g.fillRect(0, 0, target.getWidth(), target.getHeight()); + g.drawImage(source, 0, 0, null); + } finally { + g.dispose(); + } + + return target; + } } \ No newline at end of file From 126562cd509ec4e4b54f8ec361d812353bc6cda8 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 19:51:13 +0900 Subject: [PATCH 11/20] =?UTF-8?q?[Feat]=20#204=20=ED=85=8D=EC=8A=A4?= =?UTF-8?q?=ED=8A=B8=20=ED=8C=8C=EC=8B=B1=20=EC=8B=9C=EC=97=90=20=EC=82=AC?= =?UTF-8?q?=EC=9A=A9=EB=90=98=EB=8A=94=20=EC=9C=A0=ED=8B=B8=20=ED=95=A8?= =?UTF-8?q?=EC=88=98=20=ED=81=B4=EB=9E=98=EC=8A=A4=EB=A1=9C=20=EB=B6=84?= =?UTF-8?q?=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../document/util/DocumentDateUtils.java | 56 +++++++++++++++++++ .../document/util/DocumentTextUtils.java | 12 ++++ 2 files changed, 68 insertions(+) create mode 100644 src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java create mode 100644 src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java diff --git a/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java new file mode 100644 index 0000000..69934bf --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java @@ -0,0 +1,56 @@ +package org.sopt.kareer.global.document.util; + +import lombok.AccessLevel; +import lombok.NoArgsConstructor; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.List; +import java.util.Locale; + +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DocumentDateUtils { + private static final List DEFAULT_DATE_FORMATTERS = List.of( + DateTimeFormatter.ofPattern("yyyy-MM-dd"), + DateTimeFormatter.ofPattern("yyyy.MM.dd"), + DateTimeFormatter.ofPattern("yyyy/MM/dd"), + DateTimeFormatter.ofPattern("dd-MM-yyyy"), + DateTimeFormatter.ofPattern("dd.MM.yyyy"), + DateTimeFormatter.ofPattern("dd/MM/yyyy"), + DateTimeFormatter.ofPattern("d MMM yyyy", Locale.ENGLISH), + DateTimeFormatter.ofPattern("dd MMM yyyy", Locale.ENGLISH) + ); + + public static LocalDate parseDate(String raw) { + if (raw == null || raw.isBlank()) { + return null; + } + + String value = raw.trim() + .toUpperCase(Locale.ROOT) + .replaceAll("\\s+", " "); + + for (DateTimeFormatter formatter : DEFAULT_DATE_FORMATTERS) { + try { + return LocalDate.parse(value, formatter); + } catch (Exception ignored) { + } + } + + return null; + } + + public static LocalDate parseYYMMDD(String value) { + try { + int year = Integer.parseInt(value.substring(0, 2)); + int month = Integer.parseInt(value.substring(2, 4)); + int day = Integer.parseInt(value.substring(4, 6)); + + year += (year >= 50 ? 1900 : 2000); + + return LocalDate.of(year, month, day); + } catch (Exception e) { + return null; + } + } +} diff --git a/src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java b/src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java new file mode 100644 index 0000000..77c940e --- /dev/null +++ b/src/main/java/org/sopt/kareer/global/document/util/DocumentTextUtils.java @@ -0,0 +1,12 @@ +package org.sopt.kareer.global.document.util; + +import lombok.AccessLevel; +import lombok.NoArgsConstructor; + +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DocumentTextUtils { + + public static String normalize(String text){ + return text == null ? "" : text.replaceAll("\\s+", " ").trim(); + } +} From 2d9b876215c612e2b0ecd7e810f4966861a3a87f Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 19:51:49 +0900 Subject: [PATCH 12/20] =?UTF-8?q?[Chore]=20#204=20=EB=AC=B8=EC=84=9C=20?= =?UTF-8?q?=EC=B2=98=EB=A6=AC=20=EC=8B=9C=20=EC=98=88=EC=99=B8=EB=A5=BC=20?= =?UTF-8?q?=ED=95=98=EC=9C=84=20=ED=81=B4=EB=9E=98=EC=8A=A4=EB=A1=9C=20?= =?UTF-8?q?=EC=9C=84=EC=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../service/DocumentProcessingService.java | 53 ++++++++++--------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java index 66df032..b8332ad 100644 --- a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java +++ b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java @@ -27,33 +27,27 @@ public class DocumentProcessingService { private final ClovaOcrService clovaOcrService; - public String extractText(MultipartFile file) { + public String extractText(MultipartFile file) throws IOException { return extractPagesWithOcr(file).stream() .map(PageText::text) .collect(Collectors.joining("\n")); } - public List extractPagesWithOcr(MultipartFile file) { + public List extractPagesWithOcr(MultipartFile file) throws IOException { validate(file); String contentType = file.getContentType(); String filename = file.getOriginalFilename(); - try { - if (isPdf(contentType, filename)) { - return extractPagesFromPdf(file); - } - - if (isImage(contentType)) { - return extractPagesFromImage(file); - } + if (isPdf(contentType, filename)) { + return extractPagesFromPdf(file); + } - throw new DocumentException(DocumentErrorCode.UNSUPPORTED_FILE_TYPE); - } catch (DocumentException e) { - throw e; - } catch (Exception e) { - throw new DocumentException(DocumentErrorCode.EXTRACT_TEXT_FAILED, e.getMessage()); + if (isImage(contentType)) { + return extractPagesFromImage(file); } + + throw new DocumentException(DocumentErrorCode.UNSUPPORTED_FILE_TYPE); } private List extractPagesFromPdf(MultipartFile file) throws IOException { @@ -84,19 +78,28 @@ private List extractPagesFromPdf(MultipartFile file) throws IOExceptio } } - private List extractPagesFromImage(MultipartFile file) throws IOException { - BufferedImage image = ImageIO.read(file.getInputStream()); - if (image == null) { - throw new DocumentException(DocumentErrorCode.INVALID_IMAGE_FILE); - } + private List extractPagesFromImage(MultipartFile file) { + try { + BufferedImage image = ImageIO.read(file.getInputStream()); - String text = sanitizeText(clovaOcrService.doOcr(image)); + if (image == null) { + throw new DocumentException(DocumentErrorCode.INVALID_IMAGE_FILE, "유효하지 않은 이미지 파일입니다."); + } - if (text.isBlank()) { - return List.of(); - } + String text = sanitizeText(clovaOcrService.doOcr(image)); + + if (text.isBlank()) { + return List.of(); + } - return List.of(new PageText(1, text)); + return List.of(new PageText(1, text)); + } catch (DocumentException e) { + throw e; + } catch (IOException e) { + throw new DocumentException(DocumentErrorCode.INVALID_IMAGE_FILE, e.getMessage()); + } catch (Exception e) { + throw new DocumentException(DocumentErrorCode.EXTRACT_IMAGE_FAILED, e.getMessage()); + } } private void validate(MultipartFile file) { From 4460f273c7845abcd33945f7749f6cd70f382a43 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 19:52:20 +0900 Subject: [PATCH 13/20] =?UTF-8?q?[Feat]=20#204=20MRZ=20=EA=B7=9C=EA=B2=A9?= =?UTF-8?q?=EC=97=90=20=EB=A7=9E=EB=8A=94=20=EB=82=98=EB=9D=BC=20=EC=9D=B4?= =?UTF-8?q?=EB=A6=84=EC=9C=BC=EB=A1=9C=20=EB=A7=A4=ED=95=91=ED=95=98?= =?UTF-8?q?=EB=8A=94=20=ED=81=B4=EB=9E=98=EC=8A=A4=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/member/util/CountryResolver.java | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java diff --git a/src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java b/src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java new file mode 100644 index 0000000..125417e --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/util/CountryResolver.java @@ -0,0 +1,51 @@ +package org.sopt.kareer.domain.member.util; + +import org.sopt.kareer.domain.member.entity.enums.Country; +import org.springframework.stereotype.Component; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +@Component +public class CountryResolver { + + private final Map ISO3_MAP; + + public CountryResolver() { + ISO3_MAP = buildIso3Map(); + } + + private Map buildIso3Map() { + Map map = new HashMap<>(); + + for (Country country : Country.values()) { + try { + Locale locale = findLocaleByCountryName(country.getCountryName()); + + if (locale != null) { + String iso3 = locale.getISO3Country(); + map.put(iso3, country); + } + + } catch (Exception ignored) { + } + } + + return map; + } + + private Locale findLocaleByCountryName(String countryName) { + for (Locale locale : Locale.getAvailableLocales()) { + if (countryName.equalsIgnoreCase(locale.getDisplayCountry(Locale.ENGLISH))) { + return locale; + } + } + return null; + } + + public Country resolveIso3(String iso3) { + if (iso3 == null) return null; + return ISO3_MAP.get(iso3.toUpperCase()); + } +} From 7a0f587905f1d13156381d8e0e7a1b6857d8bff6 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 19:52:38 +0900 Subject: [PATCH 14/20] =?UTF-8?q?[Feat]=20#204=20=EC=97=AC=EA=B6=8C?= =?UTF-8?q?=EC=97=90=EC=84=9C=20=EC=A0=95=EB=B3=B4=20=EC=B6=94=EC=B6=9C=20?= =?UTF-8?q?=EB=A9=94=EC=84=9C=EB=93=9C=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/member/util/PassportOcrParser.java | 205 ++++++++++++++++++ .../domain/member/util/VisaOcrParser.java | 60 +---- .../document/exception/DocumentErrorCode.java | 1 + 3 files changed, 213 insertions(+), 53 deletions(-) create mode 100644 src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java diff --git a/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java new file mode 100644 index 0000000..a9f2c3c --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java @@ -0,0 +1,205 @@ +package org.sopt.kareer.domain.member.util; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.sopt.kareer.domain.member.entity.enums.Country; +import org.sopt.kareer.global.document.util.DocumentDateUtils; +import org.sopt.kareer.global.document.util.DocumentTextUtils; +import org.springframework.stereotype.Component; + +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Slf4j +@Component +@RequiredArgsConstructor +public class PassportOcrParser { + + private static final int TD3_LENGTH = 44; + + private final CountryResolver countryResolver; + + public PassportInfo parse(String rawText) { + log.info("passport rawText: {}", rawText); + + String text = DocumentTextUtils.normalize(rawText); + Mrz mrz = extractMrz(text); + + if (mrz == null) { + return new PassportInfo(null, null, null); + } + + log.info("mrz line1={}, line2={}", mrz.line1(), mrz.line2()); + + String fullName = extractName(mrz.line1()); + Country country = extractCountry(mrz.line1(), mrz.line2()); + LocalDate birthDate = extractBirth(mrz.line2()); + + return new PassportInfo(fullName, country, birthDate); + } + + private Mrz extractMrz(String text) { + List candidates = collectCandidates(text); + + String line1 = null; + String line2 = null; + + for (String c : candidates) { + + if (line1 == null && isLine1(c)) { + line1 = normalizeLineLength(c); + continue; + } + + if (line2 == null && isLine2(c)) { + line2 = normalizeLineLength(c); + } + + if (line1 != null && line2 != null) break; + } + + if (line1 == null && line2 == null) return null; + + return new Mrz(line1, line2); + } + + private List collectCandidates(String text) { + List result = new ArrayList<>(); + String upper = text.toUpperCase(Locale.ROOT); + + for (String token : upper.split("\\s+")) { + String cleaned = sanitize(token); + if (cleaned.length() >= 20) result.add(cleaned); + } + + Matcher m = Pattern.compile("[A-Z0-9<]{20,}").matcher(upper); + while (m.find()) { + result.add(sanitize(m.group())); + } + + return result; + } + + private String sanitize(String v) { + return v.replaceAll("[^A-Z0-9<]", ""); + } + + private boolean isLine1(String v) { + if (v == null || v.length() < 20) return false; + + String fitted = normalizeLineLength(v); + + if (fitted.charAt(0) != 'P') return false; + + // 3~5: 국가코드 + if (!substringSafely(fitted, 2, 5).matches("[A-Z]{3}")) return false; + + return fitted.contains("<<"); + } + + private boolean isLine2(String v) { + if (v == null || v.length() < 20) return false; + + String fitted = normalizeLineLength(v); + + // 11~13: 국가코드 + if (!substringSafely(fitted, 10, 13).matches("[A-Z]{3}")) return false; + + // 14~19: 생년월일 + return substringSafely(fitted, 13, 19).matches("\\d{6}"); + } + + private String normalizeLineLength(String v) { + if (v.length() >= TD3_LENGTH) { + return v.substring(0, TD3_LENGTH); + } + return v + "<".repeat(TD3_LENGTH - v.length()); + } + + private String substringSafely(String v, int s, int e) { + if (v.length() < e) return ""; + return v.substring(s, e); + } + + private String extractName(String line1) { + if (line1 == null) return null; + + try { + String fitted = normalizeLineLength(line1); + + // 반드시 P로 시작해야 함 + if (fitted.charAt(0) != 'P') return null; + + // 국가코드 이후부터 이름 + String body = fitted.substring(5); + + String[] parts = body.split("<<", 2); + if (parts.length < 2) return null; + + String surname = normalizeName(parts[0]); + String given = normalizeName(parts[1]); + + String name = (surname + " " + given).trim(); + return name.isBlank() ? null : name; + + } catch (Exception e) { + return null; + } + } + + private Country extractCountry(String line1, String line2) { + try { + if (line1 != null) { + String fitted = normalizeLineLength(line1); + String code = substringSafely(fitted, 2, 5); + Country c = countryResolver.resolveIso3(code); + if (c != null) return c; + } + + if (line2 != null) { + String fitted = normalizeLineLength(line2); + String code = substringSafely(fitted, 10, 13); + return countryResolver.resolveIso3(code); + } + + return null; + + } catch (Exception e) { + return null; + } + } + + private LocalDate extractBirth(String line2) { + try { + if (line2 == null) return null; + + String fitted = normalizeLineLength(line2); + String birth = substringSafely(fitted, 13, 19); + + if (!birth.matches("\\d{6}")) return null; + + return DocumentDateUtils.parseYYMMDD(birth); + + } catch (Exception e) { + return null; + } + } + + private String normalizeName(String v) { + return v.replace("<", " ") + .replaceAll("\\s+", " ") + .trim(); + } + + private record Mrz(String line1, String line2) {} + + public record PassportInfo( + String fullName, + Country country, + LocalDate birthDate + ) {} +} \ No newline at end of file diff --git a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java index 934015b..d3945c6 100644 --- a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java +++ b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java @@ -2,14 +2,14 @@ import lombok.extern.slf4j.Slf4j; import org.sopt.kareer.domain.member.entity.enums.VisaType; +import org.sopt.kareer.global.document.util.DocumentDateUtils; +import org.sopt.kareer.global.document.util.DocumentTextUtils; import org.springframework.stereotype.Component; import java.time.LocalDate; -import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Comparator; import java.util.List; -import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -38,7 +38,7 @@ public class VisaOcrParser { ); public VisaInfo parse(String rawText) { - String text = normalize(rawText); + String text = DocumentTextUtils.normalize(rawText); log.info("rawText: {}", text); VisaType visaType = extractVisaType(text); @@ -62,10 +62,6 @@ public VisaInfo parse(String rawText) { return new VisaInfo(visaType, visaStartDate, visaExpiredAt); } - private String normalize(String text) { - return text == null ? "" : text.replaceAll("\\s+", " ").trim(); - } - private VisaType extractVisaType(String text) { Matcher matcher = SUPPORTED_VISA_PATTERN.matcher(text); if (matcher.find()) { @@ -77,7 +73,7 @@ private VisaType extractVisaType(String text) { private LocalDate extractVisaStartDate(String text) { Matcher matcher = START_DATE_PATTERN.matcher(text); if (matcher.find()) { - return parseDate(matcher.group(2)); + return DocumentDateUtils.parseDate(matcher.group(2)); } return null; } @@ -85,7 +81,7 @@ private LocalDate extractVisaStartDate(String text) { private LocalDate extractVisaExpiredAt(String text) { Matcher matcher = EXPIRE_DATE_PATTERN.matcher(text); if (matcher.find()) { - return parseDate(matcher.group(2)); + return DocumentDateUtils.parseDate(matcher.group(2)); } return null; } @@ -95,7 +91,7 @@ private List extractAllDates(String text) { Matcher matcher = DATE_PATTERN.matcher(text); while (matcher.find()) { - LocalDate parsed = parseDate(matcher.group(1)); + LocalDate parsed = DocumentDateUtils.parseDate(matcher.group(1)); if (parsed != null) { dates.add(parsed); } @@ -143,7 +139,7 @@ private LocalDate extractExpireDateFromMrz(String text) { Matcher matcher = mrzPattern.matcher(text); while (matcher.find()) { - LocalDate parsed = parseYYMMDD(matcher.group(1)); + LocalDate parsed = DocumentDateUtils.parseYYMMDD(matcher.group(1)); if (parsed != null) { return parsed; } @@ -152,48 +148,6 @@ private LocalDate extractExpireDateFromMrz(String text) { return null; } - private LocalDate parseDate(String raw) { - if (raw == null || raw.isBlank()) { - return null; - } - - String value = raw.trim().toUpperCase(Locale.ROOT).replaceAll("\\s+", " "); - - DateTimeFormatter[] formatters = new DateTimeFormatter[]{ - DateTimeFormatter.ofPattern("yyyy-MM-dd"), - DateTimeFormatter.ofPattern("yyyy.MM.dd"), - DateTimeFormatter.ofPattern("yyyy/MM/dd"), - DateTimeFormatter.ofPattern("dd-MM-yyyy"), - DateTimeFormatter.ofPattern("dd.MM.yyyy"), - DateTimeFormatter.ofPattern("dd/MM/yyyy"), - DateTimeFormatter.ofPattern("d MMM yyyy", Locale.ENGLISH), - DateTimeFormatter.ofPattern("dd MMM yyyy", Locale.ENGLISH) - }; - - for (DateTimeFormatter formatter : formatters) { - try { - return LocalDate.parse(value, formatter); - } catch (Exception ignored) { - } - } - - return null; - } - - private LocalDate parseYYMMDD(String value) { - try { - int year = Integer.parseInt(value.substring(0, 2)); - int month = Integer.parseInt(value.substring(2, 4)); - int day = Integer.parseInt(value.substring(4, 6)); - - year += (year >= 50 ? 1900 : 2000); - - return LocalDate.of(year, month, day); - } catch (Exception e) { - return null; - } - } - public record VisaInfo( VisaType visaType, LocalDate visaStartDate, diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java index 13cfe0c..80bb368 100644 --- a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java @@ -10,6 +10,7 @@ public enum DocumentErrorCode implements ErrorCode { INVALID_IMAGE_FILE(HttpStatus.BAD_REQUEST.value(), "유효하지 않은 이미지 파일입니다."), FILE_EMPTY(HttpStatus.BAD_REQUEST.value(), "파일이 비어있습니다."), UNSUPPORTED_FILE_TYPE(HttpStatus.BAD_REQUEST.value(), "지원하지 않는 파일 형식입니다."), + EXTRACT_IMAGE_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "이미지 추출에 실패하였습니다."), ; private final int httpStatus; From d83ee731d1fe64f0b54a74a74ba2d05a5df3d4fe Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 19:52:51 +0900 Subject: [PATCH 15/20] =?UTF-8?q?[Feat]=20#204=20=EC=97=AC=EA=B6=8C?= =?UTF-8?q?=EC=9D=84=20=ED=86=B5=ED=95=B4=20=EC=82=AC=EC=9A=A9=EC=9E=90=20?= =?UTF-8?q?=EC=A0=95=EB=B3=B4=20=EB=82=B4=EB=A0=A4=EC=A3=BC=EB=8A=94=20?= =?UTF-8?q?=EB=A1=9C=EC=A7=81=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../member/controller/MemberController.java | 13 ++++++++- .../dto/response/OcrPassportResponse.java | 28 +++++++++++++++++++ .../domain/member/service/MemberService.java | 18 ++++++++---- 3 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java diff --git a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java index d195f20..0f6d54a 100644 --- a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java +++ b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java @@ -29,6 +29,8 @@ import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; +import java.io.IOException; + import static org.sopt.kareer.global.config.swagger.SwaggerResponseDescription.*; @RestController @@ -162,9 +164,18 @@ public ResponseEntity> deleteMember(@AuthenticationPrincipal @Operation(summary = "온보딩 비자 OCR API", description = "온보딩 과정에서 유저의 비자 문서를 분석하여 정보를 추출합니다.") @PostMapping(value = "/onboard/ocr/visa", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) public ResponseEntity> getVisaInfo( - @RequestPart("file")MultipartFile file){ + @RequestPart("file") MultipartFile file) throws IOException { return ResponseEntity.status(HttpStatus.OK) .body(BaseResponse.ok(memberService.getVisaOcr(file), "사용자 비자 정보 추출에 성공했습니다.")); } + @Operation(summary = "온보딩 여권 OCR API", description = "온보딩 과정에서 유저의 여권을 분석하여 정보를 추출합니다.") + @PostMapping(value = "/onboard/ocr/passport", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public ResponseEntity> getPassportInfo( + @RequestPart("file") MultipartFile file) throws IOException { + return ResponseEntity.status(HttpStatus.OK) + .body(BaseResponse.ok(memberService.getPassportOcr(file), "사용자 여권 정보 추출에 성공했습니다.")); + } + + } diff --git a/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java new file mode 100644 index 0000000..85063ff --- /dev/null +++ b/src/main/java/org/sopt/kareer/domain/member/dto/response/OcrPassportResponse.java @@ -0,0 +1,28 @@ +package org.sopt.kareer.domain.member.dto.response; + +import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Builder; +import org.sopt.kareer.domain.member.entity.enums.Country; +import org.sopt.kareer.domain.member.util.PassportOcrParser; + +import java.time.LocalDate; + +@Builder +public record OcrPassportResponse( + @Schema(description = "Full Name", example = "Hong Seungwon") + String fullName, + + @Schema(description = "국가", example = "AFGHANISTAN") + Country country, + + @Schema(description = "생년월일") + LocalDate birthDate +) { + public static OcrPassportResponse from(PassportOcrParser.PassportInfo passportInfo) { + return OcrPassportResponse.builder() + .fullName(passportInfo.fullName()) + .country(passportInfo.country()) + .birthDate(passportInfo.birthDate()) + .build(); + } +} diff --git a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java index 34628bc..f70d554 100644 --- a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java +++ b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java @@ -3,10 +3,7 @@ import lombok.RequiredArgsConstructor; import org.sopt.kareer.domain.member.dto.request.MemberOnboardRequest; import org.sopt.kareer.domain.member.dto.request.MemberOnboardV2Request; -import org.sopt.kareer.domain.member.dto.response.MemberInfoResponse; -import org.sopt.kareer.domain.member.dto.response.MemberStatusResponse; -import org.sopt.kareer.domain.member.dto.response.MypageResponse; -import org.sopt.kareer.domain.member.dto.response.OcrVisaResponse; +import org.sopt.kareer.domain.member.dto.response.*; import org.sopt.kareer.domain.member.entity.Member; import org.sopt.kareer.domain.member.entity.MemberVisa; import org.sopt.kareer.domain.member.entity.enums.MemberStatus; @@ -15,6 +12,7 @@ import org.sopt.kareer.domain.member.repository.MemberRepository; import org.sopt.kareer.domain.member.repository.MemberVisaRepository; import org.sopt.kareer.domain.member.service.dto.request.MypageCommand; +import org.sopt.kareer.domain.member.util.PassportOcrParser; import org.sopt.kareer.domain.member.util.VisaOcrParser; import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.sopt.kareer.global.exception.customexception.GlobalException; @@ -25,6 +23,8 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.web.multipart.MultipartFile; +import java.io.IOException; + @Service @RequiredArgsConstructor @Transactional(readOnly = true) @@ -35,6 +35,7 @@ public class MemberService { private final MemberDeletionService memberDeletionService; private final DocumentProcessingService documentProcessingService; private final VisaOcrParser visaOcrParser; + private final PassportOcrParser passportOcrParser; public Member getById(Long memberId) { return memberRepository.findById(memberId) @@ -180,10 +181,17 @@ public void deleteMember(Long memberId) { } - public OcrVisaResponse getVisaOcr(MultipartFile file) { + public OcrVisaResponse getVisaOcr(MultipartFile file) throws IOException { String text = documentProcessingService.extractText(file); VisaOcrParser.VisaInfo visaInfo = visaOcrParser.parse(text); return OcrVisaResponse.from(visaInfo); } + + public OcrPassportResponse getPassportOcr(MultipartFile file) throws IOException { + String text = documentProcessingService.extractText(file); + PassportOcrParser.PassportInfo passportInfo = passportOcrParser.parse(text); + + return OcrPassportResponse.from(passportInfo); + } } From 909e6bb797adde3e80c15c9f6f13add5231efa60 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 20:18:39 +0900 Subject: [PATCH 16/20] =?UTF-8?q?[Chore]=20#204=20=EB=B6=88=ED=95=84?= =?UTF-8?q?=EC=9A=94=ED=95=9C=20=EB=A1=9C=EA=B7=B8=20=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../org/sopt/kareer/domain/member/util/PassportOcrParser.java | 4 ---- .../org/sopt/kareer/domain/member/util/VisaOcrParser.java | 1 - .../kareer/global/external/clova/service/ClovaOcrService.java | 2 -- 3 files changed, 7 deletions(-) diff --git a/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java index a9f2c3c..9450786 100644 --- a/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java +++ b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java @@ -24,8 +24,6 @@ public class PassportOcrParser { private final CountryResolver countryResolver; public PassportInfo parse(String rawText) { - log.info("passport rawText: {}", rawText); - String text = DocumentTextUtils.normalize(rawText); Mrz mrz = extractMrz(text); @@ -33,8 +31,6 @@ public PassportInfo parse(String rawText) { return new PassportInfo(null, null, null); } - log.info("mrz line1={}, line2={}", mrz.line1(), mrz.line2()); - String fullName = extractName(mrz.line1()); Country country = extractCountry(mrz.line1(), mrz.line2()); LocalDate birthDate = extractBirth(mrz.line2()); diff --git a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java index d3945c6..3a65f32 100644 --- a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java +++ b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java @@ -39,7 +39,6 @@ public class VisaOcrParser { public VisaInfo parse(String rawText) { String text = DocumentTextUtils.normalize(rawText); - log.info("rawText: {}", text); VisaType visaType = extractVisaType(text); LocalDate visaStartDate = extractVisaStartDate(text); diff --git a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java index 2997d8a..5c6ed84 100644 --- a/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java +++ b/src/main/java/org/sopt/kareer/global/external/clova/service/ClovaOcrService.java @@ -40,8 +40,6 @@ public String doOcr(BufferedImage image) { byte[] imageBytes = toOcrBytes(image); String base64 = Base64.getEncoder().encodeToString(imageBytes); - log.info("CLOVA OCR request image size: {} bytes", imageBytes.length); - ClovaOcrRequest body = new ClovaOcrRequest( "V2", UUID.randomUUID().toString(), From 0e035dcb02702efb0c959a3f9c7fa4ad48c574b7 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 20:25:18 +0900 Subject: [PATCH 17/20] =?UTF-8?q?[Refactor]=20#204=20=EC=BB=A8=ED=8A=B8?= =?UTF-8?q?=EB=A1=A4=EB=9F=AC=EC=97=90=EC=84=9C=20=EC=98=88=EC=99=B8=20?= =?UTF-8?q?=EB=8D=98=EC=A7=80=EC=A7=80=20=EC=95=8A=EB=8F=84=EB=A1=9D=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../member/controller/MemberController.java | 6 +-- .../domain/member/service/MemberService.java | 40 +++++++++++++------ .../document/exception/DocumentErrorCode.java | 1 + 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java index 0f6d54a..3b70ea5 100644 --- a/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java +++ b/src/main/java/org/sopt/kareer/domain/member/controller/MemberController.java @@ -29,8 +29,6 @@ import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; -import java.io.IOException; - import static org.sopt.kareer.global.config.swagger.SwaggerResponseDescription.*; @RestController @@ -164,7 +162,7 @@ public ResponseEntity> deleteMember(@AuthenticationPrincipal @Operation(summary = "온보딩 비자 OCR API", description = "온보딩 과정에서 유저의 비자 문서를 분석하여 정보를 추출합니다.") @PostMapping(value = "/onboard/ocr/visa", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) public ResponseEntity> getVisaInfo( - @RequestPart("file") MultipartFile file) throws IOException { + @RequestPart("file") MultipartFile file){ return ResponseEntity.status(HttpStatus.OK) .body(BaseResponse.ok(memberService.getVisaOcr(file), "사용자 비자 정보 추출에 성공했습니다.")); } @@ -172,7 +170,7 @@ public ResponseEntity> getVisaInfo( @Operation(summary = "온보딩 여권 OCR API", description = "온보딩 과정에서 유저의 여권을 분석하여 정보를 추출합니다.") @PostMapping(value = "/onboard/ocr/passport", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) public ResponseEntity> getPassportInfo( - @RequestPart("file") MultipartFile file) throws IOException { + @RequestPart("file") MultipartFile file){ return ResponseEntity.status(HttpStatus.OK) .body(BaseResponse.ok(memberService.getPassportOcr(file), "사용자 여권 정보 추출에 성공했습니다.")); } diff --git a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java index f70d554..10f27d6 100644 --- a/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java +++ b/src/main/java/org/sopt/kareer/domain/member/service/MemberService.java @@ -14,6 +14,8 @@ import org.sopt.kareer.domain.member.service.dto.request.MypageCommand; import org.sopt.kareer.domain.member.util.PassportOcrParser; import org.sopt.kareer.domain.member.util.VisaOcrParser; +import org.sopt.kareer.global.document.exception.DocumentErrorCode; +import org.sopt.kareer.global.document.exception.DocumentException; import org.sopt.kareer.global.document.service.DocumentProcessingService; import org.sopt.kareer.global.exception.customexception.GlobalException; import org.sopt.kareer.global.exception.errorcode.GlobalErrorCode; @@ -23,8 +25,6 @@ import org.springframework.transaction.annotation.Transactional; import org.springframework.web.multipart.MultipartFile; -import java.io.IOException; - @Service @RequiredArgsConstructor @Transactional(readOnly = true) @@ -181,17 +181,33 @@ public void deleteMember(Long memberId) { } - public OcrVisaResponse getVisaOcr(MultipartFile file) throws IOException { - String text = documentProcessingService.extractText(file); - VisaOcrParser.VisaInfo visaInfo = visaOcrParser.parse(text); - - return OcrVisaResponse.from(visaInfo); + public OcrVisaResponse getVisaOcr(MultipartFile file){ + try { + String text = documentProcessingService.extractText(file); + VisaOcrParser.VisaInfo visaInfo = visaOcrParser.parse(text); + + return OcrVisaResponse.from(visaInfo); + } catch (DocumentException e) { + throw e; + } catch(Exception e) { + throw new DocumentException( + DocumentErrorCode.OCR_PROCESSING_FAILED + ); + } } - public OcrPassportResponse getPassportOcr(MultipartFile file) throws IOException { - String text = documentProcessingService.extractText(file); - PassportOcrParser.PassportInfo passportInfo = passportOcrParser.parse(text); - - return OcrPassportResponse.from(passportInfo); + public OcrPassportResponse getPassportOcr(MultipartFile file) { + try { + String text = documentProcessingService.extractText(file); + PassportOcrParser.PassportInfo passportInfo = passportOcrParser.parse(text); + + return OcrPassportResponse.from(passportInfo); + } catch (DocumentException e) { + throw e; + } catch(Exception e) { + throw new DocumentException( + DocumentErrorCode.OCR_PROCESSING_FAILED + ); + } } } diff --git a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java index 80bb368..f1538a8 100644 --- a/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java +++ b/src/main/java/org/sopt/kareer/global/document/exception/DocumentErrorCode.java @@ -11,6 +11,7 @@ public enum DocumentErrorCode implements ErrorCode { FILE_EMPTY(HttpStatus.BAD_REQUEST.value(), "파일이 비어있습니다."), UNSUPPORTED_FILE_TYPE(HttpStatus.BAD_REQUEST.value(), "지원하지 않는 파일 형식입니다."), EXTRACT_IMAGE_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "이미지 추출에 실패하였습니다."), + OCR_PROCESSING_FAILED(HttpStatus.INTERNAL_SERVER_ERROR.value(), "OCR 처리 중 오류가 발생했습니다."), ; private final int httpStatus; From acc18ce52333d3b42642870be3141702d94fa01a Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 20:29:34 +0900 Subject: [PATCH 18/20] =?UTF-8?q?[Chore]=20#204=20=EC=83=9D=EB=85=84?= =?UTF-8?q?=EC=9B=94=EC=9D=BC=20=ED=8C=8C=EC=8B=B1=20=EC=8B=9C=20=EB=8C=80?= =?UTF-8?q?=EB=AC=B8=EC=9E=90=20=EB=B3=80=ED=99=98=20=EB=A1=9C=EC=A7=81=20?= =?UTF-8?q?=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../global/document/util/DocumentDateUtils.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java index 69934bf..4021da4 100644 --- a/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java +++ b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java @@ -5,6 +5,7 @@ import java.time.LocalDate; import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; import java.util.List; import java.util.Locale; @@ -17,8 +18,14 @@ public class DocumentDateUtils { DateTimeFormatter.ofPattern("dd-MM-yyyy"), DateTimeFormatter.ofPattern("dd.MM.yyyy"), DateTimeFormatter.ofPattern("dd/MM/yyyy"), - DateTimeFormatter.ofPattern("d MMM yyyy", Locale.ENGLISH), - DateTimeFormatter.ofPattern("dd MMM yyyy", Locale.ENGLISH) + new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("d MMM yyyy") + .toFormatter(Locale.ENGLISH), + new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("dd MMM yyyy") + .toFormatter(Locale.ENGLISH) ); public static LocalDate parseDate(String raw) { From a302bcee8a5e13cfd6054de421857c26c71164d5 Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 20:31:21 +0900 Subject: [PATCH 19/20] =?UTF-8?q?[Refactor]=20#204=20=EC=83=9D=EB=85=84?= =?UTF-8?q?=EC=9B=94=EC=9D=BC=20=EA=B3=84=EC=82=B0=20=EB=A1=9C=EC=A7=81=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/member/util/PassportOcrParser.java | 2 +- .../kareer/domain/member/util/VisaOcrParser.java | 2 +- .../global/document/util/DocumentDateUtils.java | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java index 9450786..b9b4204 100644 --- a/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java +++ b/src/main/java/org/sopt/kareer/domain/member/util/PassportOcrParser.java @@ -178,7 +178,7 @@ private LocalDate extractBirth(String line2) { if (!birth.matches("\\d{6}")) return null; - return DocumentDateUtils.parseYYMMDD(birth); + return DocumentDateUtils.parseLocalDate(birth); } catch (Exception e) { return null; diff --git a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java index 3a65f32..ac1a9d9 100644 --- a/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java +++ b/src/main/java/org/sopt/kareer/domain/member/util/VisaOcrParser.java @@ -138,7 +138,7 @@ private LocalDate extractExpireDateFromMrz(String text) { Matcher matcher = mrzPattern.matcher(text); while (matcher.find()) { - LocalDate parsed = DocumentDateUtils.parseYYMMDD(matcher.group(1)); + LocalDate parsed = DocumentDateUtils.parseLocalDate(matcher.group(1)); if (parsed != null) { return parsed; } diff --git a/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java index 4021da4..945de24 100644 --- a/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java +++ b/src/main/java/org/sopt/kareer/global/document/util/DocumentDateUtils.java @@ -47,15 +47,20 @@ public static LocalDate parseDate(String raw) { return null; } - public static LocalDate parseYYMMDD(String value) { + public static LocalDate parseLocalDate(String value) { try { - int year = Integer.parseInt(value.substring(0, 2)); + int yy = Integer.parseInt(value.substring(0, 2)); int month = Integer.parseInt(value.substring(2, 4)); int day = Integer.parseInt(value.substring(4, 6)); - year += (year >= 50 ? 1900 : 2000); + int year = 2000 + yy; + LocalDate date = LocalDate.of(year, month, day); - return LocalDate.of(year, month, day); + if (date.isAfter(LocalDate.now())) { + date = date.minusYears(100); + } + + return date; } catch (Exception e) { return null; } From a136e09a547a1bbb96008ca6dc7860a175e3713f Mon Sep 17 00:00:00 2001 From: dlwjddus1112 Date: Thu, 26 Mar 2026 20:35:03 +0900 Subject: [PATCH 20/20] =?UTF-8?q?[Refactor]=20#204=20OCR=20=EC=95=88?= =?UTF-8?q?=EC=A0=95=EC=84=B1=EC=9D=84=20=EC=9C=84=ED=95=9C=20=EC=9E=85?= =?UTF-8?q?=EB=A0=A5=20=ED=8F=AC=EB=A7=B7=20=EC=A0=9C=ED=95=9C/=EC=A0=95?= =?UTF-8?q?=EC=B1=85=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../service/DocumentProcessingService.java | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java index b8332ad..d4c2281 100644 --- a/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java +++ b/src/main/java/org/sopt/kareer/global/document/service/DocumentProcessingService.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.stream.Collectors; @Service @@ -43,7 +44,7 @@ public List extractPagesWithOcr(MultipartFile file) throws IOException return extractPagesFromPdf(file); } - if (isImage(contentType)) { + if (isImage(contentType, filename)) { return extractPagesFromImage(file); } @@ -113,8 +114,19 @@ private boolean isPdf(String contentType, String filename) { || (filename != null && filename.toLowerCase().endsWith(".pdf")); } - private boolean isImage(String contentType) { - return contentType != null && contentType.startsWith("image/"); + private boolean isImage(String contentType, String filename) { + if (contentType != null && contentType.startsWith("image/")) { + return true; + } + + if (filename == null) { + return false; + } + + String lower = filename.toLowerCase(Locale.ROOT); + return lower.endsWith(".jpg") + || lower.endsWith(".jpeg") + || lower.endsWith(".png"); } private static String sanitizeText(String s) {