From 4189595c4139c9569f8d3f98370eb4dcc2b7768e Mon Sep 17 00:00:00 2001 From: Hari Date: Thu, 19 Mar 2026 01:28:48 +0530 Subject: [PATCH] handled the indeed links and smooth finish --- .../service/GeminiExtractionService.java | 1 - .../service/GmailIntegrationService.java | 7 +- .../service/GmailWebhookService.java | 129 ++++++++++++------ .../service/IngestionService.java | 3 +- .../jobtrackerpro/service/JobService.java | 2 +- .../jobtrackerpro/util/UrlParser.java | 40 ++++-- .../resources/application-prod.properties | 7 + 7 files changed, 128 insertions(+), 61 deletions(-) diff --git a/backend/src/main/java/com/thughari/jobtrackerpro/service/GeminiExtractionService.java b/backend/src/main/java/com/thughari/jobtrackerpro/service/GeminiExtractionService.java index f1769c6..9776987 100644 --- a/backend/src/main/java/com/thughari/jobtrackerpro/service/GeminiExtractionService.java +++ b/backend/src/main/java/com/thughari/jobtrackerpro/service/GeminiExtractionService.java @@ -395,7 +395,6 @@ private String buildBatchPrompt(List items) { - help - privacy - settings - - account management If no job-related link exists: diff --git a/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailIntegrationService.java b/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailIntegrationService.java index 11bbe8e..567bb1b 100644 --- a/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailIntegrationService.java +++ b/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailIntegrationService.java @@ -119,7 +119,7 @@ public void connectAndSetupPush(String authCode, String email) throws Exception userRepository.saveAndFlush(user); - log.info("Gmail Automation enabled with 1 DB transaction for: {}", user.getEmail()); + log.info("User {} successfully connected Gmail. Watch set with label ID: {}", email, labelId); } @Async("taskExecutor") @@ -151,8 +151,6 @@ public void initiateManualSync(String email) { String currentHistoryId = service.users().getProfile("me").execute().getHistoryId().toString(); jobService.finalizeManualSync(email, currentHistoryId); - - log.info("Manual sync finished for {}. Found {} jobs.", email, found); } catch (Exception e) { log.error("Manual sync failed for {}: {}", email, e.getMessage()); } finally { @@ -340,8 +338,6 @@ public void disconnectGmail(String email) { userRepository.saveAndFlush(user); cleanupGoogleResourcesAsync(refreshToken, labelId); - - log.info("User {} disconnected from Gmail. Local state cleared.", email); } @Async("taskExecutor") @@ -358,7 +354,6 @@ protected void cleanupGoogleResourcesAsync(String refreshToken, String labelId) .uri("https://oauth2.googleapis.com/revoke?token=" + refreshToken) .retrieve(); - log.info("Google resources cleaned up and token revoked."); } catch (Exception e) { log.warn("Non-critical: Google resource cleanup failed: {}", e.getMessage()); } diff --git a/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailWebhookService.java b/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailWebhookService.java index 78ac7e5..122f3de 100644 --- a/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailWebhookService.java +++ b/backend/src/main/java/com/thughari/jobtrackerpro/service/GmailWebhookService.java @@ -24,6 +24,8 @@ import java.util.ArrayList; import java.util.Base64; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; @Service @Slf4j @@ -50,19 +52,14 @@ public GmailWebhookService(GeminiService geminiService, JobService jobService, U @Async("taskExecutor") public void processHistorySync(String userEmail) { final String email = userEmail.toLowerCase(); - + LocalDateTime now = LocalDateTime.now(); LocalDateTime expiryThreshold = now.minusMinutes(15); - int updatedRows = userRepository.claimSyncLock(email, now, expiryThreshold); - if (updatedRows == 0) return; - - cacheEvictService.evictAllForUser(email); + if (userRepository.claimSyncLock(email, now, expiryThreshold) == 0) return; try { - User user = userRepository.findByEmail(email) - .orElseThrow(() -> new RuntimeException("User not found after lock")); - + User user = userRepository.findByEmail(email).orElseThrow(); if (user.getGmailRefreshToken() == null) return; String accessToken = getFreshAccessToken(user.getGmailRefreshToken()); @@ -87,15 +84,12 @@ public void processHistorySync(String userEmail) { List batchItems = collectMessages(service, historyResponse.getHistory()); if (!batchItems.isEmpty()) { - - List extractedJobs = geminiService.extractJobsFromBatch(batchItems); - - log.info("Ingesting batch of {} emails via Gemini for {}", batchItems.size(), email); - - jobService.saveBatchResults(email, batchItems, extractedJobs); - } + log.info("Ingesting batch of {} emails for {}", batchItems.size(), email); + List extractedJobs = geminiService.extractJobsFromBatch(batchItems); + jobService.saveBatchResults(email, batchItems, extractedJobs); + } } catch (Exception e) { - log.error("High-Performance Sync failed for {}: ", email, e); + log.error("Sync failed for {}: ", email, e); } finally { userRepository.releaseSyncLock(email); cacheEvictService.evictAllForUser(email); @@ -110,14 +104,10 @@ private List collectMessages(Gmail service, List histor if (history.getMessagesAdded() == null) continue; for (HistoryMessageAdded added : history.getMessagesAdded()) { try { - Message m = service.users().messages().get("me", added.getMessage().getId()) - .setFormat("full").execute(); - - long millisecondTimestamp = m.getInternalDate(); - LocalDateTime emailDate = LocalDateTime.ofInstant( - Instant.ofEpochMilli(millisecondTimestamp), ZoneOffset.UTC); + Message m = service.users().messages().get("me", added.getMessage().getId()).setFormat("full").execute(); + LocalDateTime emailDate = LocalDateTime.ofInstant(Instant.ofEpochMilli(m.getInternalDate()), ZoneOffset.UTC); - String from = "", subj = "", replyTo=""; + String from = "", subj = "", replyTo = ""; for (var h : m.getPayload().getHeaders()) { if ("From".equalsIgnoreCase(h.getName())) from = h.getValue(); if ("Subject".equalsIgnoreCase(h.getName())) subj = h.getValue(); @@ -125,30 +115,94 @@ private List collectMessages(Gmail service, List histor } if (!isSystemNoise(subj)) { - String body = extractTextFromBody(m.getPayload()); + String body = extractProcessedBody(m.getPayload()); items.add(new EmailBatchItem(from, subj, replyTo, body, emailDate)); } } catch (Exception e) { - log.warn("Failed to fetch message {}: {}", added.getMessage().getId(), e.getMessage()); + log.warn("Failed message fetch: {}", e.getMessage()); } } } return items; } - private String extractTextFromBody(MessagePart part) { + private String extractProcessedBody(MessagePart payload) { + StringBuilder rawBuffer = new StringBuilder(); + recursiveRawCollect(payload, rawBuffer); + + String cleaned = surgicalClean(rawBuffer.toString()); + + return cleaned; + } + + private void recursiveRawCollect(MessagePart part, StringBuilder buffer) { + if (part.getParts() != null) { + for (MessagePart subPart : part.getParts()) recursiveRawCollect(subPart, buffer); + } if (part.getBody() != null && part.getBody().getData() != null) { - String content = new String(Base64.getUrlDecoder().decode(part.getBody().getData())); - if (part.getMimeType().contains("text/plain")) return content; - if (part.getMimeType().contains("text/html")) return content.replaceAll("<[^>]*>", " "); + buffer.append(new String(Base64.getUrlDecoder().decode(part.getBody().getData()))).append("\n"); } - if (part.getParts() != null) { - for (MessagePart subPart : part.getParts()) { - String text = extractTextFromBody(subPart); - if (text != null && !text.isBlank()) return text; + } + + private String surgicalClean(String rawHtml) { + if (rawHtml == null || rawHtml.isBlank()) return ""; + + String content = rawHtml.replaceAll("(?is).*?", "") + .replaceAll("(?is).*?", ""); + + StringBuilder sb = new StringBuilder(); + Matcher m = Pattern.compile("(?is)]*?href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*?>(.*?)").matcher(content); + + int lastEnd = 0; + while (m.find()) { + sb.append(content, lastEnd, m.start()); + + String rawUrl = m.group(1).replace("&", "&"); + String linkText = m.group(2).replaceAll("<[^>]*>", "").trim(); + + String processedUrl = processUrlByDomain(rawUrl); + + boolean isJobLink = processedUrl.contains("viewjob") || processedUrl.contains("confirmemail") || + processedUrl.contains("linkedin.com/jobs") || processedUrl.contains("careers") || + processedUrl.contains("apply"); + + if (isJobLink && processedUrl.length() > 15) { + sb.append(" [LINK_START]").append(linkText).append("[LINK_URL]").append(processedUrl).append("[LINK_END] "); + } else { + sb.append(" ").append(linkText).append(" "); } + + lastEnd = m.end(); + } + sb.append(content.substring(lastEnd)); + + return sb.toString() + .replaceAll("(?i)", "\n") + .replaceAll("(?i)", " ") + .replaceAll("<[^>]*>", " ") + .replaceAll(" ", " ") + .replaceAll("\\s+", " ") + .trim(); + } + + private String processUrlByDomain(String url) { + if (url == null) return ""; + String lowerUrl = url.toLowerCase(); + + if (lowerUrl.contains("linkedin.com/jobs") || lowerUrl.contains("linkedin.com/comm/jobs")) { + int queryIndex = url.indexOf("?"); + return queryIndex > 0 ? url.substring(0, queryIndex) : url; } - return ""; + + if (lowerUrl.contains("indeed.com")) { + return url; + } + + if (url.contains("utm_") || url.contains("ref=")) { + return url.replaceAll("[?&]utm_[^&]+", "").replaceAll("[?&]ref=[^&]+", ""); + } + + return url; } private void bootstrapUserHistory(Gmail service, User user) throws Exception { @@ -164,13 +218,6 @@ private boolean isSystemNoise(String subject) { return s.contains("security alert") || s.contains("sign-in") || s.contains("verification code"); } -// private void evictUserCaches(String email) { -// Cache userCache = cacheManager.getCache("users"); -// Cache entityCache = cacheManager.getCache("userEntities"); -// if (userCache != null) userCache.evict(email); -// if (entityCache != null) entityCache.evict(email); -// } - public String getFreshAccessToken(String refreshToken) throws Exception { return new GoogleRefreshTokenRequest(GoogleNetHttpTransport.newTrustedTransport(), GsonFactory.getDefaultInstance(), refreshToken, clientId, clientSecret).execute().getAccessToken(); diff --git a/backend/src/main/java/com/thughari/jobtrackerpro/service/IngestionService.java b/backend/src/main/java/com/thughari/jobtrackerpro/service/IngestionService.java index aca3524..914b5dc 100644 --- a/backend/src/main/java/com/thughari/jobtrackerpro/service/IngestionService.java +++ b/backend/src/main/java/com/thughari/jobtrackerpro/service/IngestionService.java @@ -30,7 +30,7 @@ public void handleManualForward(String from, String subject, String body, String if (user == null) return; if (Boolean.TRUE.equals(user.getGmailConnected())) { - log.info("Discarding forwarded email for {}: Direct Sync is active.", userEmail); + log.warn("Discarding forwarded email for {}: Direct Sync is active.", userEmail); return; } @@ -39,7 +39,6 @@ public void handleManualForward(String from, String subject, String body, String if (job != null) { jobService.createOrUpdateJob(job, userEmail); - log.info("Successfully ingested forwarded job: {} for {}", job.getCompany(), userEmail); } } } \ No newline at end of file diff --git a/backend/src/main/java/com/thughari/jobtrackerpro/service/JobService.java b/backend/src/main/java/com/thughari/jobtrackerpro/service/JobService.java index 0f0e561..b8be32d 100644 --- a/backend/src/main/java/com/thughari/jobtrackerpro/service/JobService.java +++ b/backend/src/main/java/com/thughari/jobtrackerpro/service/JobService.java @@ -175,7 +175,7 @@ public void saveBatchResults(String email, List batchItems, List List> batchUrlLists = batchItems.parallelStream() .map(item -> UrlParser.extractAndCleanUrls(item.body())) - .toList(); + .toList(); for (JobDTO job : extractedJobs) { Integer idx = job.getInputIndex(); diff --git a/backend/src/main/java/com/thughari/jobtrackerpro/util/UrlParser.java b/backend/src/main/java/com/thughari/jobtrackerpro/util/UrlParser.java index c410add..47583a0 100644 --- a/backend/src/main/java/com/thughari/jobtrackerpro/util/UrlParser.java +++ b/backend/src/main/java/com/thughari/jobtrackerpro/util/UrlParser.java @@ -8,30 +8,50 @@ public class UrlParser { - private static final Pattern URL_PATTERN = Pattern.compile("https?://[a-zA-Z0-9./?=&%_\\-]+"); + private static final Pattern URL_PATTERN = Pattern.compile("(https?://|www\\.)[a-zA-Z0-9./?=&%_\\-+]+(? extractAndCleanUrls(String text) { if (text == null) return List.of(); List urls = new ArrayList<>(); Matcher matcher = URL_PATTERN.matcher(text); while (matcher.find()) { - urls.add(cleanTrackingParams(matcher.group())); + String rawUrl = matcher.group(); + urls.add(processUrlByDomain(rawUrl)); } - return urls.stream().distinct().collect(Collectors.toList()); + return urls.stream() + .filter(url -> !url.isBlank()) + .distinct() + .collect(Collectors.toList()); } - private static String cleanTrackingParams(String url) { - int qIndex = url.indexOf("?"); - return qIndex > 0 ? url.substring(0, qIndex) : url; + + private static String processUrlByDomain(String url) { + String lowerUrl = url.toLowerCase(); + + if (lowerUrl.contains("indeed.com")) { + return url; + } + + if (lowerUrl.contains("linkedin.com") || lowerUrl.contains("utm_") || lowerUrl.contains("ref=") || lowerUrl.contains("trk=")) { + int qIndex = url.indexOf("?"); + return qIndex > 0 ? url.substring(0, qIndex) : url; + } + + return url; } public static String trimNoise(String body) { if (body == null) return ""; - String[] markers = {"View similar jobs", "Unsubscribe", "©", "Help Center", "References"}; + + String cleanBody = body.replaceAll("(?is).*?", "") + .replaceAll("(?is).*?", ""); + + String[] markers = {"View similar jobs", "Unsubscribe", "©", "Help Center", "References", "Privacy Policy"}; for (String marker : markers) { - int index = body.indexOf(marker); - if (index > 0) body = body.substring(0, index); + int index = cleanBody.indexOf(marker); + if (index > 0) cleanBody = cleanBody.substring(0, index); } - return body.length() > 3000 ? body.substring(0, 3000 ) : body; + + return cleanBody.length() > 3000 ? cleanBody.substring(0, 3000) : cleanBody; } } \ No newline at end of file diff --git a/backend/src/main/resources/application-prod.properties b/backend/src/main/resources/application-prod.properties index 7308fb7..639be1b 100644 --- a/backend/src/main/resources/application-prod.properties +++ b/backend/src/main/resources/application-prod.properties @@ -26,6 +26,13 @@ spring.jpa.properties.hibernate.jdbc.batch_size=25 spring.jpa.properties.hibernate.order_inserts=true spring.jpa.properties.hibernate.order_updates=true +# Logging Levels +logging.level.root=WARN +logging.level.org.springframework=WARN +logging.level.org.hibernate=WARN +logging.level.com.thughari.jobtrackerpro=WARN +logging.level.com.thughari.jobtrackerpro.scheduler=INFO + # Gemini AI app.gemini.enabled=true gemini.api.key=${GEMINI_API_KEY}