diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c67541e..941c487 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,57 +2,57 @@ name: Deploy to AKS Cluster on: push: branches: - - master + - master pull_request: branches: - - master + - master jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@master - - name: Decrypt large secret - run: ./scripts/decrypt.sh - env: - APPLICATION_PROPERTIES_PASSPHRASE: ${{ secrets.APPLICATION_PROPERTIES_PASSPHRASE }} - - - name: Set up JDK 1.8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - - name: Include local jar - run: mvn install:install-file -Dfile="lib/PageSuccess-0.0.1-SNAPSHOT.jar" -DgroupId="ca.gc.tbs" -DartifactId="PageSuccess" -Dversion="0.0.1-SNAPSHOT" -Dpackaging=jar -DgeneratePom=true - - - name: Include local jar - run: mvn install:install-file -Dfile="lib/airtable.java-0.2.0.jar" -DgroupId="com.sybit" -DartifactId="airtable.java" -Dversion="0.2.0" -Dpackaging=jar -DgeneratePom=true - - - name: Build with Maven - run: mvn install --file pom.xml - - - uses: Azure/docker-login@v1 - with: - login-server: tbsacr.azurecr.io - username: ${{ secrets.ACR_USERNAME }} - password: ${{ secrets.ACR_PASSWORD }} - - - run: | - docker build -f ./docker/Dockerfile . -t tbsacr.azurecr.io/feedback-cj:${{ github.sha }} - docker push tbsacr.azurecr.io/feedback-cj:${{ github.sha }} - - # Set the target AKS cluster. 
- - uses: Azure/aks-set-context@v1 - with: - creds: '${{ secrets.AZURE_CREDENTIALS }}' - cluster-name: tbs-prod-aks - resource-group: tbs-prod-rg - - - uses: Azure/k8s-deploy@v1 - with: - manifests: | - kubernetes/feedback-cronjob.yml - images: | - tbsacr.azurecr.io/feedback-cj:${{ github.sha }} - namespace: | - pagesuccess + - uses: actions/checkout@master + - name: Decrypt large secret + run: ./scripts/decrypt.sh + env: + APPLICATION_PROPERTIES_PASSPHRASE: ${{ secrets.APPLICATION_PROPERTIES_PASSPHRASE }} + - name: Set up JDK 17 + uses: actions/setup-java@v3 + with: + java-version: "17" + distribution: "temurin" + + - name: Include local jar + run: mvn install:install-file -Dfile="lib/PageSuccess-0.0.1-SNAPSHOT.jar" -DgroupId="ca.gc.tbs" -DartifactId="PageSuccess" -Dversion="0.0.1-SNAPSHOT" -Dpackaging=jar -DgeneratePom=true + + - name: Include local jar + run: mvn install:install-file -Dfile="lib/airtable.java-0.2.0.jar" -DgroupId="com.sybit" -DartifactId="airtable.java" -Dversion="0.2.0" -Dpackaging=jar -DgeneratePom=true + + - name: Build with Maven + run: mvn install --file pom.xml + + - uses: Azure/docker-login@v1 + with: + login-server: tbsacr.azurecr.io + username: ${{ secrets.ACR_USERNAME }} + password: ${{ secrets.ACR_PASSWORD }} + + - run: | + docker build -f ./docker/Dockerfile . -t tbsacr.azurecr.io/feedback-cj:${{ github.sha }} + docker push tbsacr.azurecr.io/feedback-cj:${{ github.sha }} + + # Set the target AKS cluster. 
+ - uses: Azure/aks-set-context@v1 + with: + creds: "${{ secrets.AZURE_CREDENTIALS }}" + cluster-name: tbs-prod-aks + resource-group: tbs-prod-rg + + - uses: Azure/k8s-deploy@v1 + with: + manifests: | + kubernetes/feedback-cronjob.yml + images: | + tbsacr.azurecr.io/feedback-cj:${{ github.sha }} + namespace: | + pagesuccess diff --git a/.gitignore b/.gitignore index 28f2b49..8af135f 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,10 @@ hs_err_pid* **/StoredCredential .DS_Store +# Google Service Account Keys (plaintext - encrypted versions OK) +**/service-account.json +**/service-account.p12 + pagefeedback-cj.iml feedback-cj.iml Feedback Tool.iml diff --git a/docker/Dockerfile b/docker/Dockerfile index d06101f..4a23180 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,4 @@ -FROM maven:3.8.6-openjdk-8-slim -RUN apt-get clean -RUN apt-get update +FROM eclipse-temurin:17-jre-alpine RUN mkdir -p /app COPY target/pagefeedback-cj-1.0.0-SNAPSHOT.jar /app/app.jar ENV JAVA_OPTS="-Xmx2g" diff --git a/lib/PageSuccess-0.0.1-SNAPSHOT.jar b/lib/PageSuccess-0.0.1-SNAPSHOT.jar index f87118f..b3fa8e3 100644 Binary files a/lib/PageSuccess-0.0.1-SNAPSHOT.jar and b/lib/PageSuccess-0.0.1-SNAPSHOT.jar differ diff --git a/pom.xml b/pom.xml index 7489d7a..d24559f 100644 --- a/pom.xml +++ b/pom.xml @@ -10,9 +10,14 @@ org.springframework.boot spring-boot-starter-parent - 2.2.1.RELEASE + 3.2.5 + + 17 + 17 + 17 + @@ -50,36 +55,38 @@ commons-io commons-io - 2.11.0 + 2.15.1 - org.apache.httpcomponents - httpclient - 4.5.13 + org.apache.httpcomponents.client5 + httpclient5 + 5.4 org.json json - 20160810 + 20240303 org.springframework.boot spring-boot-starter - org.slf4j - slf4j-api - 2.0.4 + org.apache.commons + commons-lang3 + 3.14.0 + com.mashape.unirest unirest-java 1.4.9 + - org.apache.commons - commons-lang3 - 3.12.0 + org.glassfish.jaxb + jaxb-runtime + 2.3.9 com.sybit @@ -91,10 +98,17 @@ PageSuccess 0.0.1-SNAPSHOT + com.google.apis google-api-services-sheets - 
v4-rev20210629-1.32.1 + v4-rev20240826-2.0.0 + + + + com.google.auth + google-auth-library-oauth2-http + 1.24.1 org.springframework.data @@ -108,12 +122,12 @@ org.apache.commons commons-csv - 1.9.0 + 1.11.0 org.jsoup jsoup - 1.15.3 + 1.18.3 uk.gov.service.notify diff --git a/scripts/decrypt.sh b/scripts/decrypt.sh index 7f8a078..18756b9 100755 --- a/scripts/decrypt.sh +++ b/scripts/decrypt.sh @@ -3,8 +3,10 @@ export GPG_TTY=$(tty) ls ./src/main/resources +# Decrypt application properties gpg --quiet --batch --yes --passphrase="$APPLICATION_PROPERTIES_PASSPHRASE" --output ./src/main/resources/application.properties --decrypt ./src/main/resources/application.properties.gpg -gpg --quiet --batch --yes --passphrase="$APPLICATION_PROPERTIES_PASSPHRASE" --output ./src/main/resources/service-account.p12 --decrypt ./src/main/resources/service-account.p12.gpg +# Decrypt Google service account JSON key (modern format) +gpg --quiet --batch --yes --passphrase="$APPLICATION_PROPERTIES_PASSPHRASE" --output ./src/main/resources/service-account.json --decrypt ./src/main/resources/service-account.json.gpg ls ./src/main/resources \ No newline at end of file diff --git a/src/main/java/ca/gc/tbs/AirTableMLTag.java b/src/main/java/ca/gc/tbs/AirTableMLTag.java deleted file mode 100644 index 5e58ef8..0000000 --- a/src/main/java/ca/gc/tbs/AirTableMLTag.java +++ /dev/null @@ -1,34 +0,0 @@ -package ca.gc.tbs; - -import com.google.gson.annotations.SerializedName; - -public class AirTableMLTag { - private String id; - - @SerializedName("ML tags") - private String tag; - - public AirTableMLTag(String tag) { - this.tag = tag; - } - - public AirTableMLTag() { - - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getTag() { - return tag; - } - - public void setTag(String tag) { - this.tag = tag; - } -} diff --git a/src/main/java/ca/gc/tbs/AirTableProblem.java b/src/main/java/ca/gc/tbs/AirTableProblem.java deleted file mode 
100644 index a59a923..0000000 --- a/src/main/java/ca/gc/tbs/AirTableProblem.java +++ /dev/null @@ -1,137 +0,0 @@ -package ca.gc.tbs; -import com.google.gson.annotations.SerializedName; - -public class AirTableProblem { - private String id; - - @SerializedName("Unique ID") - private String uniqueID; - @SerializedName("Date") - private String date; - @SerializedName("Time received") - private String timeStamp; - @SerializedName("URL") - private String URL; - @SerializedName("Name") - private String URL_link; - @SerializedName("Page title") - private String pageTitle; - @SerializedName("Lang") - private String lang; - @SerializedName("What's wrong") - private String whatswrong; - @SerializedName("Details") - private String details; - @SerializedName("Tags") - private String tags; - @SerializedName("Info exists") - private String infoExists; - @SerializedName("PII") - private String PII; - - @SerializedName("PII Type") - private String PIIType; - - - @SerializedName("Topic - HC") - private String topic; - @SerializedName("Actionable") - private Boolean actionable; - - - - - - public String getId() { - return id; - } - public void setId(String id) { - this.id = id; - } - public String getTimeStamp() { - return timeStamp; - } - public void setTimeStamp(String timeStamp) { - this.timeStamp = timeStamp; - } - public String getDate() { - return date; - } - public void setDate(String date) { - this.date = date; - } - public String getURL() { - return URL; - } - public void setURL(String uRL) { - URL = uRL; - } - public String getPageTitle() { - return pageTitle; - } - public void setPageTitle(String pageTitle) { - this.pageTitle = pageTitle; - } - public String getLang() { - return lang; - } - public void setLang(String lang) { - this.lang = lang; - } - public String getWhatswrong() { - return whatswrong; - } - public void setWhatswrong(String whatswrong) { - this.whatswrong = whatswrong; - } - public String getDetails() { - return details; - } - public void 
setDetails(String details) { - this.details = details; - } - public String getTags() { - return tags; - } - public void setTags(String tags) { - this.tags = tags; - } - public String getInfoExists() { - return infoExists; - } - public void setInfoExists(String infoExists) { - this.infoExists = infoExists; - } - public String getPII() { - return PII; - } - public void setPII(String pII) { - PII = pII; - } - public String getTopic() { - return topic; - } - public void setTopic(String topic) { - this.topic = topic; - } - public String getURL_link() { - return URL_link; - } - public void setURL_link(String uRL_link) { - URL_link = uRL_link; - } - public String getUniqueID() { - return uniqueID; - } - public void setUniqueID(String uniqueID) { - this.uniqueID = uniqueID; - } - public String getPIIType() { - return PIIType; - } - public void setPIIType(String pIIType) { - PIIType = pIIType; - } - - -} diff --git a/src/main/java/ca/gc/tbs/AirTableStat.java b/src/main/java/ca/gc/tbs/AirTableStat.java deleted file mode 100644 index 8ad986e..0000000 --- a/src/main/java/ca/gc/tbs/AirTableStat.java +++ /dev/null @@ -1,34 +0,0 @@ -package ca.gc.tbs; - -import com.google.gson.annotations.SerializedName; - -public class AirTableStat { - private String id; - - @SerializedName("Page title") - private String pageTitle; - - public AirTableStat(String title) { - this.pageTitle = title; - } - - public AirTableStat() { - - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getPageTitle() { - return pageTitle; - } - - public void setPageTitle(String pageTitle) { - this.pageTitle = pageTitle; - } -} diff --git a/src/main/java/ca/gc/tbs/AirTableURLLink.java b/src/main/java/ca/gc/tbs/AirTableURLLink.java deleted file mode 100644 index 19023ba..0000000 --- a/src/main/java/ca/gc/tbs/AirTableURLLink.java +++ /dev/null @@ -1,36 +0,0 @@ -package ca.gc.tbs; - -import com.google.gson.annotations.SerializedName; - -public class 
AirTableURLLink { - private String id; - - @SerializedName("Name") - private String URLlink; - - - public AirTableURLLink(String urlLink) { - this.URLlink = urlLink; - } - - public AirTableURLLink() { - - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - public String getURLlink() { - return URLlink; - } - - public void setURLlink(String uRLlink) { - URLlink = uRLlink; - } - - -} diff --git a/src/main/java/ca/gc/tbs/GoogleSheetsAPI.java b/src/main/java/ca/gc/tbs/GoogleSheetsAPI.java index 20c4fd0..a4650ed 100644 --- a/src/main/java/ca/gc/tbs/GoogleSheetsAPI.java +++ b/src/main/java/ca/gc/tbs/GoogleSheetsAPI.java @@ -1,6 +1,5 @@ package ca.gc.tbs; -import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; import com.google.api.client.http.javanet.NetHttpTransport; import com.google.api.client.json.JsonFactory; @@ -9,98 +8,195 @@ import com.google.api.services.sheets.v4.SheetsScopes; import com.google.api.services.sheets.v4.model.AppendValuesResponse; import com.google.api.services.sheets.v4.model.ValueRange; +import com.google.auth.http.HttpCredentialsAdapter; +import com.google.auth.oauth2.GoogleCredentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; +import java.io.InputStream; import java.security.GeneralSecurityException; -import java.security.KeyStore; -import java.security.PrivateKey; import java.util.Arrays; import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; +/** + * Google Sheets API client for appending feedback data. + * Uses modern GoogleCredentials with JSON key file and implements credential caching, + * retry logic, and thread-safe operations. 
+ */ public class GoogleSheetsAPI { + private static final Logger logger = LoggerFactory.getLogger(GoogleSheetsAPI.class); - static final String spreadsheetId = "1B16qEbfp7SFCfIsZ8fcj7DneCy1WkR0GPh4t9L9NRSg"; - static final String duplicateCommentsSpreadsheetId = "1cR2mih5sBwl3wUjniwdyVA0xZcqV2Wl9yhghJfMG5oM"; // Template ID to - // be replaced - static final String range = "A1:A50000"; - private static final String APPLICATION_NAME = "My Google Sheets Application"; + // TODO: Externalize these to application.properties + static final String SPREADSHEET_ID = "1B16qEbfp7SFCfIsZ8fcj7DneCy1WkR0GPh4t9L9NRSg"; + static final String DUPLICATE_COMMENTS_SPREADSHEET_ID = "1cR2mih5sBwl3wUjniwdyVA0xZcqV2Wl9yhghJfMG5oM"; + static final String URL_RANGE = "A1:A50000"; + static final String DUPLICATE_RANGE = "A1:D50000"; + + private static final String APPLICATION_NAME = "Page Feedback CronJob"; private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance(); - private static final String SERVICE_ACCOUNT_EMAIL = "cronjob@feedback-cj.iam.gserviceaccount.com"; + private static final String SERVICE_ACCOUNT_KEY_FILE = "service-account.json"; + + // Retry configuration + private static final int MAX_RETRY_ATTEMPTS = 3; + private static final long INITIAL_RETRY_DELAY_MS = 1000; + + // Cached Sheets service instance (thread-safe lazy initialization) + private static volatile Sheets sheetsService; + private static final Object lock = new Object(); + /** - * Global instance of the HTTP transport. + * Gets or creates a cached Sheets service instance. + * Thread-safe singleton pattern with double-checked locking. 
+ * + * @return Sheets service instance + * @throws IOException if service account key file cannot be read + * @throws GeneralSecurityException if HTTP transport cannot be created */ - private static NetHttpTransport HTTP_TRANSPORT; + private static Sheets getSheetsService() throws IOException, GeneralSecurityException { + if (sheetsService == null) { + synchronized (lock) { + if (sheetsService == null) { + logger.debug("Initializing Google Sheets service"); + sheetsService = createSheetsService(); + } + } + } + return sheetsService; + } - public static void appendURL(String url) throws GeneralSecurityException, IOException { - KeyStore keystore = KeyStore.getInstance("PKCS12"); - keystore.load(GoogleSheetsAPI.class.getClassLoader().getResourceAsStream("service-account.p12"), - "notasecret".toCharArray()); - PrivateKey pk = (PrivateKey) keystore.getKey("privatekey", "notasecret".toCharArray()); + /** + * Creates a new Sheets service instance with modern GoogleCredentials. + * + * @return configured Sheets service + * @throws IOException if service account key file cannot be read + * @throws GeneralSecurityException if HTTP transport cannot be created + */ + private static Sheets createSheetsService() throws IOException, GeneralSecurityException { + NetHttpTransport httpTransport = GoogleNetHttpTransport.newTrustedTransport(); - final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport(); + GoogleCredentials credentials; + try (InputStream keyStream = GoogleSheetsAPI.class.getClassLoader() + .getResourceAsStream(SERVICE_ACCOUNT_KEY_FILE)) { - GoogleCredential credential = new GoogleCredential.Builder().setTransport(HTTP_TRANSPORT) - .setJsonFactory(JSON_FACTORY) - .setServiceAccountId(SERVICE_ACCOUNT_EMAIL) - .setServiceAccountScopes(Collections.singleton(SheetsScopes.SPREADSHEETS)) - .setServiceAccountPrivateKey(pk) - .build(); + if (keyStream == null) { + throw new IOException("Service account key file not found: " + 
SERVICE_ACCOUNT_KEY_FILE); + } + + credentials = GoogleCredentials.fromStream(keyStream) + .createScoped(Collections.singleton(SheetsScopes.SPREADSHEETS)); + } - Sheets service = new Sheets.Builder(HTTP_TRANSPORT, JSON_FACTORY, credential) + return new Sheets.Builder(httpTransport, JSON_FACTORY, new HttpCredentialsAdapter(credentials)) .setApplicationName(APPLICATION_NAME) .build(); + } - ValueRange appendBody = new ValueRange() - .setValues(Arrays.asList( - Arrays.asList(url))); - try { - AppendValuesResponse appendResult = service.spreadsheets().values() - .append(spreadsheetId, range, appendBody) - .setValueInputOption("USER_ENTERED") - .setInsertDataOption("INSERT_ROWS") - .setIncludeValuesInResponse(true) - .execute(); - } catch (IOException e) { - e.printStackTrace(); - } + /** + * Appends a URL to the main feedback spreadsheet with retry logic. + * + * @param url the URL to append + * @throws IOException if all retry attempts fail + * @throws GeneralSecurityException if unable to create HTTP transport + */ + public static void appendURL(String url) throws IOException, GeneralSecurityException { + logger.debug("Appending URL to spreadsheet: {}", url); + appendValues(SPREADSHEET_ID, URL_RANGE, Collections.singletonList(url)); } + /** + * Appends duplicate comment data to the duplicate comments spreadsheet with retry logic. 
+ * + * @param date the date of the comment + * @param timestamp the timestamp of the comment + * @param url the URL associated with the comment + * @param comment the comment text + * @throws IOException if all retry attempts fail + * @throws GeneralSecurityException if unable to create HTTP transport + */ public static void appendDuplicateComment(String date, String timestamp, String url, String comment) - throws GeneralSecurityException, IOException { - KeyStore keystore = KeyStore.getInstance("PKCS12"); - keystore.load(GoogleSheetsAPI.class.getClassLoader().getResourceAsStream("service-account.p12"), - "notasecret".toCharArray()); - PrivateKey pk = (PrivateKey) keystore.getKey("privatekey", "notasecret".toCharArray()); - - final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport(); - - GoogleCredential credential = new GoogleCredential.Builder().setTransport(HTTP_TRANSPORT) - .setJsonFactory(JSON_FACTORY) - .setServiceAccountId(SERVICE_ACCOUNT_EMAIL) - .setServiceAccountScopes(Collections.singleton(SheetsScopes.SPREADSHEETS)) - .setServiceAccountPrivateKey(pk) - .build(); + throws IOException, GeneralSecurityException { + logger.debug("Appending duplicate comment - Date: {}, URL: {}", date, url); + appendValues(DUPLICATE_COMMENTS_SPREADSHEET_ID, DUPLICATE_RANGE, + Arrays.asList(date, timestamp, url, comment)); + } - Sheets service = new Sheets.Builder(HTTP_TRANSPORT, JSON_FACTORY, credential) - .setApplicationName(APPLICATION_NAME) - .build(); + /** + * Generic method to append values to a spreadsheet with exponential backoff retry. 
+ * + * @param spreadsheetId the ID of the target spreadsheet + * @param range the A1 notation range + * @param values the values to append + * @throws IOException if all retry attempts fail + * @throws GeneralSecurityException if unable to create HTTP transport + */ + private static void appendValues(String spreadsheetId, String range, List values) + throws IOException, GeneralSecurityException { ValueRange appendBody = new ValueRange() - .setValues(Arrays.asList( - Arrays.asList(date, timestamp, url, comment))); - try { - AppendValuesResponse appendResult = service.spreadsheets().values() - .append(duplicateCommentsSpreadsheetId, "A1:D50000", appendBody) - .setValueInputOption("USER_ENTERED") - .setInsertDataOption("INSERT_ROWS") - .setIncludeValuesInResponse(true) - .execute(); - } catch (IOException e) { - e.printStackTrace(); + .setValues(Collections.singletonList(values)); + + IOException lastException = null; + + for (int attempt = 1; attempt <= MAX_RETRY_ATTEMPTS; attempt++) { + try { + Sheets service = getSheetsService(); + AppendValuesResponse response = service.spreadsheets().values() + .append(spreadsheetId, range, appendBody) + .setValueInputOption("USER_ENTERED") + .setInsertDataOption("INSERT_ROWS") + .setIncludeValuesInResponse(false) + .execute(); + + logger.debug("Successfully appended values to spreadsheet {} on attempt {}", + spreadsheetId, attempt); + return; // Success + + } catch (IOException e) { + lastException = e; + logger.warn("Attempt {}/{} failed to append to spreadsheet {}: {}", + attempt, MAX_RETRY_ATTEMPTS, spreadsheetId, e.getMessage()); + + if (attempt < MAX_RETRY_ATTEMPTS) { + long delayMs = INITIAL_RETRY_DELAY_MS * (long) Math.pow(2, attempt - 1); + logger.debug("Retrying in {} ms", delayMs); + try { + TimeUnit.MILLISECONDS.sleep(delayMs); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException("Retry interrupted", ie); + } + } + } + } + + // All retries failed + logger.error("Failed 
to append values to spreadsheet {} after {} attempts", + spreadsheetId, MAX_RETRY_ATTEMPTS, lastException); + throw lastException; + } + + /** + * Clears the cached Sheets service. Useful for testing or forcing re-initialization. + */ + static void clearCache() { + synchronized (lock) { + sheetsService = null; + logger.debug("Cleared cached Sheets service"); } } - public static void main(String[] args) throws GeneralSecurityException, IOException { - appendURL("test"); + /** + * Main method for testing. + */ + public static void main(String[] args) { + try { + appendURL("test-url"); + logger.info("Test append successful"); + } catch (Exception e) { + logger.error("Test append failed", e); + } } } diff --git a/src/main/java/ca/gc/tbs/Main.java b/src/main/java/ca/gc/tbs/Main.java index 2f8fb54..471ac66 100644 --- a/src/main/java/ca/gc/tbs/Main.java +++ b/src/main/java/ca/gc/tbs/Main.java @@ -1,694 +1,93 @@ package ca.gc.tbs; -import ca.gc.tbs.domain.Problem; -import ca.gc.tbs.domain.TopTaskSurvey; -import ca.gc.tbs.repository.ProblemRepository; -import ca.gc.tbs.repository.TopTaskRepository; -import ca.gc.tbs.service.ContentService; -import com.sybit.airtable.Airtable; -import com.sybit.airtable.Base; -import com.sybit.airtable.Table; -import org.apache.commons.csv.CSVFormat; -import org.apache.commons.csv.CSVRecord; -import org.apache.commons.lang3.StringUtils; -import org.apache.http.client.utils.URIBuilder; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.CommandLineRunner; import org.springframework.boot.WebApplicationType; import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.autoconfigure.data.jpa.JpaRepositoriesAutoConfiguration; +import 
org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration; +import org.springframework.boot.autoconfigure.orm.jpa.HibernateJpaAutoConfiguration; +import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; import org.springframework.boot.builder.SpringApplicationBuilder; import org.springframework.context.annotation.ComponentScan; +import org.springframework.context.annotation.FilterType; import org.springframework.data.mongodb.datatables.DataTablesRepositoryFactoryBean; import org.springframework.data.mongodb.repository.config.EnableMongoRepositories; -import java.io.InputStreamReader; -import java.io.Reader; -import java.net.MalformedURLException; -import java.net.URISyntaxException; -import java.net.URL; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; -import java.util.*; -import java.util.stream.Collectors; - -import static java.lang.System.exit; - -@SpringBootApplication -@ComponentScan(basePackages = {"ca.gc.tbs.domain", "ca.gc.tbs.repository", "ca.gc.tbs.service"}) -@EnableMongoRepositories(repositoryFactoryBeanClass = DataTablesRepositoryFactoryBean.class) +import ca.gc.tbs.service.AirtableSyncService; +import ca.gc.tbs.service.ProblemCleaningService; +import ca.gc.tbs.service.SpreadsheetService; +import ca.gc.tbs.service.TopTaskCleaningService; + +/** + * Main entry point for the feedback processing cron job. + * Orchestrates data cleaning and Airtable synchronization. 
+ */ +@SpringBootApplication(exclude = { + SecurityAutoConfiguration.class, + DataSourceAutoConfiguration.class, + HibernateJpaAutoConfiguration.class, + JpaRepositoriesAutoConfiguration.class +}) +@ComponentScan( + basePackages = { + "ca.gc.tbs.service", + "ca.gc.tbs.repository", + "ca.gc.tbs.domain" + }, + excludeFilters = @ComponentScan.Filter( + type = FilterType.REGEX, + pattern = "ca\\.gc\\.tbs\\.service\\.(EmailService|ErrorKeywordService|ProblemCacheService|ProblemDateService|UserService)" + ) +) +@EnableMongoRepositories( + basePackages = "ca.gc.tbs.repository", + repositoryFactoryBeanClass = DataTablesRepositoryFactoryBean.class +) public class Main implements CommandLineRunner { private static final Logger logger = LoggerFactory.getLogger(Main.class); - // Tier 2 entries do not populate to AirTable. - private final Set tier2Spreadsheet = new HashSet<>(); - private final HashMap tier1Spreadsheet = new HashMap<>(); - - private final HashMap mainPageTitleIds = new HashMap<>(); - private final HashMap mainUrlLinkIds = new HashMap<>(); - private final HashMap mainMlTagIds = new HashMap<>(); - private final HashMap healthPageTitleIds = new HashMap<>(); - private final HashMap healthUrlLinkIds = new HashMap<>(); - private final HashMap healthMlTagIds = new HashMap<>(); + private final TopTaskCleaningService topTaskCleaningService; + private final ProblemCleaningService problemCleaningService; + private final SpreadsheetService spreadsheetService; + private final AirtableSyncService airtableSyncService; - private final HashMap CRA_PageTitleIds = new HashMap<>(); - private final HashMap CRA_UrlLinkIds = new HashMap<>(); - private final HashMap CRA_MlTagIds = new HashMap<>(); - - private final HashMap travelPageTitleIds = new HashMap<>(); - private final HashMap travelUrlLinkIds = new HashMap<>(); - private final HashMap travelMlTagIds = new HashMap<>(); - - private final HashMap IRCC_PageTitleIds = new HashMap<>(); - private final HashMap IRCC_UrlLinkIds = new 
HashMap<>(); - private final HashMap IRCC_MlTagIds = new HashMap<>(); - - @Autowired - private ContentService contentService; - @Autowired - private ProblemRepository problemRepository; - @Autowired - private TopTaskRepository topTaskRepository; - // Main AirTable - @Value("${airtable.key}") - private String airtableKey; - @Value("${airtable.tab}") - private String problemAirtableTab; - @Value("${airtable.pageTitleLookup}") - private String airtablePageTitleLookup; - @Value("${airtable.mlTags}") - private String airtableMLTags; - @Value("${airtable.URL_link}") - private String airtableURLLink; - @Value("${airtable.base}") - private String problemAirtableBase; - - // Health AirTable - @Value("${health.airtable.base}") - private String healthAirtableBase; - - // CRA AirTable - @Value("${cra.airtable.base}") - private String CRA_AirtableBase; - - // Travel AirTable - @Value("${travel.airtable.base}") - private String travelAirtableBase; - - // IRCC AirTable - @Value("${ircc.airtable.base}") - private String irccAirtableBase; - - private Base mainBase; - private Base healthBase; - private Base CRA_Base; - private Base travelBase; - private Base IRCC_Base; + public Main(TopTaskCleaningService topTaskCleaningService, + ProblemCleaningService problemCleaningService, + SpreadsheetService spreadsheetService, + AirtableSyncService airtableSyncService) { + this.topTaskCleaningService = topTaskCleaningService; + this.problemCleaningService = problemCleaningService; + this.spreadsheetService = spreadsheetService; + this.airtableSyncService = airtableSyncService; + } public static void main(String[] args) { - new SpringApplicationBuilder(Main.class).web(WebApplicationType.NONE) // .REACTIVE, .SERVLET + new SpringApplicationBuilder(Main.class) + .web(WebApplicationType.NONE) .run(args); } - // Main Loop, Runs all functions needed. @Override public void run(String... 
args) throws Exception { + airtableSyncService.initialize(); - Airtable airTableKey = new Airtable().configure(this.airtableKey); - - System.out.println("---------------------CONNECTING TO AIRTABLE BASES---------------------"); - this.mainBase = airTableKey.base(this.problemAirtableBase); - this.healthBase = airTableKey.base(this.healthAirtableBase); - this.CRA_Base = airTableKey.base(this.CRA_AirtableBase); - this.travelBase = airTableKey.base(this.travelAirtableBase); - this.IRCC_Base = airTableKey.base(this.irccAirtableBase); - - System.out.println("---------------------REMOVING PERSONAL INFO FROM TTS---------------------"); - this.removePersonalInfoExitSurvey(); - - System.out.println("---------------------REMOVING PERSONAL INFO FROM COMMENTS---------------------"); - this.removePersonalInfoProblems(); - - System.out.println("---------------------REMOVING JUNK DATA FROM TTS---------------------"); - this.removeJunkDataTTS(); - - System.out.println("---------------------IMPORTING SPREADSHEETS---------------------"); - this.importTier1(); - this.importTier2(); - - System.out.println("---------------------RETRIEVING AIRTABLE VALUES---------------------"); - this.getPageTitleIds(mainBase); - this.getPageTitleIds(healthBase); - this.getPageTitleIds(CRA_Base); - this.getPageTitleIds(travelBase); - this.getPageTitleIds(IRCC_Base); - - this.getMLTagIds(mainBase); - this.getMLTagIds(healthBase); - this.getMLTagIds(CRA_Base); - this.getMLTagIds(travelBase); - this.getMLTagIds(IRCC_Base); - - this.getURLLinkIds(mainBase); - this.getURLLinkIds(healthBase); - this.getURLLinkIds(CRA_Base); - this.getURLLinkIds(travelBase); - this.getURLLinkIds(IRCC_Base); - - System.out.println("---------------------AUTO TAGGING---------------------"); - this.autoTag(); - - System.out.println("---------------------AIRTABLE & SPREADSHEET SYNC---------------------"); - this.airTableSpreadsheetSync(); - - System.out.println("---------------------MARK AS PROCESSED ---------------------"); - 
this.completeProcessing(); - } - - // Scrubs tasks (Exit Survey) that have not been cleaned using the cleaning script - public void removePersonalInfoExitSurvey() { - List tList = this.topTaskRepository.findByPersonalInfoProcessed(null); - tList.addAll(this.topTaskRepository.findByPersonalInfoProcessed("false")); - System.out.println("Number of tasks to clean: " + tList.size()); - for (TopTaskSurvey task : tList) { - try { - if (task.getThemeOther() != null) { - String details = this.contentService.cleanContent(task.getThemeOther()); - task.setThemeOther(details); - } - if (task.getTaskOther() != null) { - String details = this.contentService.cleanContent(task.getTaskOther()); - task.setTaskOther(details); - } - if (task.getTaskImproveComment() != null) { - String details = this.contentService.cleanContent(task.getTaskImproveComment()); - task.setTaskImproveComment(details); - } - if (task.getTaskWhyNotComment() != null) { - String details = this.contentService.cleanContent(task.getTaskWhyNotComment()); - task.setTaskWhyNotComment(details); - } - task.setPersonalInfoProcessed("true"); - this.topTaskRepository.save(task); - } catch (Exception e) { - System.out.println("Could not process task: " + task.getId() + " : " + task.getDateTime() + " : " + task.getTaskOther() + " : " - + task.getTaskImproveComment() + " : " + task.getTaskWhyNotComment()); - } - } - System.out.println("Private info removed..."); - } - - // Scrubs comments that have not been cleaned using the cleaning script - public void removePersonalInfoProblems() { - List pList = this.problemRepository.findByPersonalInfoProcessed(null); - pList.addAll(this.problemRepository.findByPersonalInfoProcessed("false")); - System.out.println("Number of Problems to clean: " + pList.size()); - for (Problem problem : pList) { - try { - String details = this.contentService.cleanContent(problem.getProblemDetails()); - problem.setProblemDetails(details); - problem.setPersonalInfoProcessed("true"); - 
this.problemRepository.save(problem); - } catch (Exception e) { - System.out.println("Could not process problem:" + problem.getId() + ":" + problem.getProblemDetails()); - } - } - System.out.println("Private info removed..."); - } - - // Removes white space values from comments to improve the filter for write in comments on the Feedback-Viewer. - public void removeJunkDataTTS() { - List tList = this.topTaskRepository.findByProcessed("false"); - System.out.println("Amount of non processed entries (TTS) : " + tList.size()); - for (TopTaskSurvey task : tList) { - if (task == null || containsHTML(task.getTaskOther()) || containsHTML(task.getThemeOther()) || - containsHTML(task.getTaskImproveComment()) || containsHTML(task.getTaskWhyNotComment())) { - assert task != null; - System.out.println("Deleting task: " + task.getId() + " , Task was null or had a hyperlink, taskOther: " + task.getTaskOther() - + ", themeOther: " + task.getThemeOther() + ", taskWhyNotComment: " + task.getTaskWhyNotComment() + ", taskImproveComment: " + task.getTaskImproveComment()); - this.topTaskRepository.delete(task); - continue; - } - if (task.getTaskOther() != null && task.getTaskOther().trim().equals("") && task.getTaskOther().length() != 0) { - System.out.println("found junk data in taskOther."); - task.setTaskOther(""); - } - if (task.getThemeOther() != null && task.getThemeOther().trim().equals("") && task.getThemeOther().length() != 0) { - System.out.println("found junk data in themeOther."); - task.setThemeOther(""); - } - if (task.getTaskImproveComment() != null && task.getTaskImproveComment().trim().equals("") && task.getTaskImproveComment().length() != 0) { - System.out.println("found junk data in taskImproveComment."); - task.setTaskImproveComment(""); - } - if (task.getTaskWhyNotComment() != null && task.getTaskWhyNotComment().trim().equals("") && task.getTaskWhyNotComment().length() != 0) { - System.out.println("found junk data in taskWhyNotComment."); - 
task.setTaskWhyNotComment(""); - } - task.setProcessed("true"); - task.setProcessedDate(LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))); - this.topTaskRepository.save(task); - } - } - - // Retrieves ALL model & bases from spreadsheet and imports them to the TIER 1 map. - public void importTier1() throws Exception { - final Reader reader = new InputStreamReader( - new URL("https://docs.google.com/spreadsheets/d/1eOmX_b8XCR9eLNxUbX3Gwkp2ywJ-vhapnC7ApdRbnSg/export?format=csv").openConnection() - .getInputStream(), - StandardCharsets.UTF_8); - final CSVFormat csvFormat = CSVFormat.Builder.create().setHeader().setAllowMissingColumnNames(true).build(); - final Iterable records = csvFormat.parse(reader); - try { - for (final CSVRecord record : records) { - try { - String[] modelBase = {record.get("MODEL"), record.get("BASE").toLowerCase()}; - tier1Spreadsheet.put(record.get("URL").toLowerCase(), modelBase); - } catch (Exception e) { - System.out.println(e.getMessage()); - e.printStackTrace(); - } - } - } finally { - reader.close(); - } - } - - // Retrieves ALL URLs from spreadsheet and imports them to the TIER 2 map - public void importTier2() throws Exception { - final Reader reader = new InputStreamReader( - new URL("https://docs.google.com/spreadsheets/d/1B16qEbfp7SFCfIsZ8fcj7DneCy1WkR0GPh4t9L9NRSg/export?format=csv").openConnection() - .getInputStream(), - StandardCharsets.UTF_8); - final CSVFormat csvFormat = CSVFormat.Builder.create().setHeader().setAllowMissingColumnNames(true).build(); - final Iterable records = csvFormat.parse(reader); - try { - for (final CSVRecord record : records) { - try { - tier2Spreadsheet.add(record.get("URL").toLowerCase()); - } catch (Exception e) { - System.out.println(e.getMessage()); - e.printStackTrace(); - } - } - } finally { - reader.close(); - } - } - - - // Retrieves Page feedback statistics page IDs and adds them to a hashmap for their respective AirTable base. 
- private void getPageTitleIds(Base base) throws Exception { - @SuppressWarnings("unchecked") - Table statsTable = base.table(this.airtablePageTitleLookup, AirTableStat.class); - List stats = statsTable.select(); - HashMap m = selectMapPageTitleIds(base); - stats.forEach(entry -> { - if (entry.getPageTitle() != null) { - try { - m.put(entry.getPageTitle().trim().toUpperCase(), entry.getId()); - } catch (Exception e) { - System.out.println(e.getMessage() + " Could not add Page Title ID: " + entry.getPageTitle() + " TO page title ID map."); - } - } - }); - } - - // Retrieves Page groups by URL and adds them to a hashmap for their respective AirTable base. - private void getURLLinkIds(Base base) throws Exception { - @SuppressWarnings("unchecked") - Table urlLinkTable = base.table(this.airtableURLLink, AirTableURLLink.class); - List urlLinks = urlLinkTable.select(); - HashMap m = selectMapUrlLinkIds(base); - urlLinks.forEach(entry -> { - if (entry.getURLlink() != null) { - try { - m.put(entry.getURLlink().trim().toUpperCase(), entry.getId()); - } catch (Exception e) { - System.out.println(e.getMessage() + " Could not add URL Link ID: " + entry.getURLlink() + " TO url link ID map."); - } - } - }); - } - - // Retrieves ML Tags and adds them to a hashmap for their respective AirTable base. - private void getMLTagIds(Base base) throws Exception { - @SuppressWarnings("unchecked") - Table tagsTable = base.table(airtableMLTags, AirTableMLTag.class); - List tags = tagsTable.select(); - HashMap m = selectMapMLTagIds(base); - tags.forEach(entry -> { - if (entry.getTag() != null) { - try { - m.put(entry.getTag().trim().toUpperCase(), entry.getId()); - } catch (Exception e) { - System.out.println(e.getMessage() + " Could not add ML Tag ID: " + entry.getTag() + " TO ML tag ID map."); - } - } - }); - } - - // Assigns tags to non-processed problems. 
- public void autoTag() { - List pList = this.problemRepository.findByAutoTagProcessed("false"); - pList.addAll(this.problemRepository.findByAutoTagProcessed(null)); - System.out.println("Amount of entries to be tagged: " + pList.size()); - for (Problem problem : pList) { - String model = ""; - try { - // If problem has comment, assign language & model. - if (!problem.getProblemDetails().trim().equals("")) { - String lang = "en"; - if (problem.getLanguage().equalsIgnoreCase("fr")) { - lang = "fr"; - } - - String text = URLEncoder.encode(problem.getProblemDetails(), StandardCharsets.UTF_8.name()); - String URL = removeQueryAndFragment(problem.getUrl()).toLowerCase(); - - if (tier1Spreadsheet.containsKey(URL)) { - model = tier1Spreadsheet.get(URL)[0]; - System.out.println("model: " + model); - } - // Then feed through the suggestion script (Feedback-Classification-RetroAction - // Repository) if model exists - // and assign tags if applicable. - if (!model.equals("")) { - Document doc = Jsoup - .connect( - "https://suggestion.tbs.alpha.canada.ca/suggestCategory?lang=" + lang + "&text=" + text + "§ion=" + model) - .maxBodySize(0).get(); - String tags = doc.select("body").html(); - System.out.println("Text:" + text + " : " + tags); - String[] splitTags = tags.split(","); - problem.getTags().addAll(Arrays.asList(splitTags)); - } - } - } catch (Exception e) { - System.out.println("Could not auto tag because:" + e.getMessage() + " model:" + model); - } - problem.setAutoTagProcessed("true"); - this.problemRepository.save(problem); - } - - } - - private void writeDuplicateToFile(String comment, String url, String date, String timeStamp) { - try { - GoogleSheetsAPI.appendDuplicateComment(date, timeStamp, url, comment); - } catch (Exception e) { - System.out.println("Error writing duplicate to spreadsheet: " + e.getMessage()); - e.printStackTrace(); - } - } - - - // Populates entries to the AirTable bases and Tier 2 spreadsheet (inventory). 
- @SuppressWarnings("unchecked") - public void airTableSpreadsheetSync() { - // Connect to AirTable bases - Table problemTable = mainBase.table(this.problemAirtableTab, AirTableProblemEnhanced.class); - Table healthTable = healthBase.table(this.problemAirtableTab, AirTableProblemEnhanced.class); - Table craTable = CRA_Base.table(this.problemAirtableTab, AirTableProblemEnhanced.class); - Table travelTable = travelBase.table(this.problemAirtableTab, AirTableProblemEnhanced.class); - Table irccTable = IRCC_Base.table(this.problemAirtableTab, AirTableProblemEnhanced.class); - // Find problems that have not been run through this function - Set seenComments = new HashSet<>(); - List pList = this.problemRepository.findByAirTableSync(null); - pList.addAll(this.problemRepository.findByAirTableSync("false")); - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); - System.out.println("Connected to MongoDB & Airtable"); - System.out.println("Found " + pList.size() + " records to be processed on Date: " + LocalDate.now().format(formatter)); - int i = 1; - int maxToSync = 150; - for (Problem problem : pList) { - try { - if (i >= maxToSync) { - System.out.println("Sync only " + maxToSync + " records at a time..."); - break; - } - // In airTableSpreadsheetSync(), right after getting the pList: - - // Then in the for loop, before processing each problem: - String normalizedComment = problem.getProblemDetails().trim().toLowerCase(); - - if (seenComments.contains(normalizedComment)) { - System.out.println("Skipping duplicate comment: " + problem.getProblemDetails()); - writeDuplicateToFile(problem.getProblemDetails(), problem.getUrl(), - problem.getProblemDate() != null ? 
problem.getProblemDate() : LocalDate.now().format(formatter), problem.getTimeStamp()); - problem.setAirTableSync("true"); // Mark as processed - problemRepository.save(problem); - continue; - } - seenComments.add(normalizedComment); - - - boolean problemIsProcessed = problem.getPersonalInfoProcessed().equals("true") && problem.getAutoTagProcessed().equals("true"); - boolean junkComment = problem.getProblemDetails().trim().equals("") || containsHTML(problem.getProblemDetails()) - || problem.getUrl().equals("https://www.canada.ca/") || problem.getProblemDetails().length() > 301; - if (junkComment) { - System.out.println("Empty comment, deleting entry..."); - problemRepository.delete(problem); - continue; - } - String UTM_values = extractUtmValues(problem.getUrl()); - problem.setUrl(removeQueryAndFragment(problem.getUrl().toLowerCase())); - - // if tier 1 and tier 2 spreadsheet don't contain URL, add it to Tier 2 and set sync to true - if (!tier1Spreadsheet.containsKey(problem.getUrl()) && !tier2Spreadsheet.contains(problem.getUrl())) { - tier2Spreadsheet.add(problem.getUrl()); - GoogleSheetsAPI.appendURL(problem.getUrl()); - problem.setAirTableSync("true"); - System.out.println("Processed record : " + i + " url not in spreadsheet " + problem.getUrl() + ", Added url to Tier 2 Spreadsheet."); - } - // if tier 2 spreadsheet contains URL set AirTable sync to true // TIER 2 entries end here. 
- else if (tier2Spreadsheet.contains(problem.getUrl())) { - problem.setAirTableSync("true"); - System.out.println("Processed record : " + i + " (Tier 2) EXISTS ALREADY"); - } else { - AirTableProblemEnhanced airProblem = new AirTableProblemEnhanced(); - String base = tier1Spreadsheet.get(problem.getUrl())[1]; - - if (!selectMapUrlLinkIds(selectBase(base)).containsKey(problem.getUrl().trim().toUpperCase())) { - this.createUrlLinkEntry(problem.getUrl(), selectBase(base), airtableURLLink); - } - airProblem.getURLLinkIds().add(selectMapUrlLinkIds(selectBase(base)).get(problem.getUrl().trim().toUpperCase())); - - if (!selectMapPageTitleIds(selectBase(base)).containsKey(problem.getTitle().trim().toUpperCase())) { - this.createPageTitleEntry(problem.getTitle(), selectBase(base), airtablePageTitleLookup); - } - airProblem.getPageTitleIds().add(selectMapPageTitleIds(selectBase(base)).get(problem.getTitle().trim().toUpperCase())); - - for (String tag : problem.getTags()) { - String trimmedTag = tag.trim().toUpperCase(); - if (trimmedTag.isEmpty()) { - System.out.println("Empty tag encountered."); - } else if (selectMapMLTagIds(selectBase(base)).containsKey(trimmedTag)) { - airProblem.getTags().add(selectMapMLTagIds(selectBase(base)).get(trimmedTag)); - } else { - System.out.println("Missing tag id for:" + tag); - } - } - airProblem.setUTM(UTM_values); - setAirProblemAttributes(airProblem, problem); - - switch (base.toLowerCase()) { - case "main": - problemTable.create(airProblem); - break; - case "ircc": - irccTable.create(airProblem); - break; - case "travel": - travelTable.create(airProblem); - break; - case "cra": - craTable.create(airProblem); - break; - case "health": - healthTable.create(airProblem); - break; - } - problem.setAirTableSync("true"); - System.out.println("Processed record : " + i + " (Tier 1) Base: " + base.toUpperCase()); - } - i++; - this.problemRepository.save(problem); - } catch (Exception e) { - System.out.println(e.getMessage() + " Could not sync 
record : " + problem.getId() + " URL:" + problem.getUrl()); - } - } - } - - // Marks problems as processed if applicable - public void completeProcessing() { - List pList = this.problemRepository.findByProcessed("false"); - pList.addAll(this.problemRepository.findByProcessed(null)); - for (Problem problem : pList) { - try { - if (problem.getPersonalInfoProcessed().equals("true") && problem.getAutoTagProcessed().equals("true") - && problem.getAirTableSync().equals("true") && (problem.getProcessed() == null || problem.getProcessed().equals("false"))) { - problem.setProcessedDate(LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))); - problem.setProcessed("true"); - this.problemRepository.save(problem); - } - } catch (Exception e) { - System.out.println("Could not mark completed because:" + e.getMessage() + ": ID:" + problem.getId()); - } - } - System.out.println("Finished processing..."); - exit(0); - } - - public Boolean containsHTML(String comment) { - if (comment == null) return false; - // This normalizeSpace call was added because sometimes sentences are written with extra spaces between words which triggers as HTML. 
- comment = StringUtils.normalizeSpace(comment); - String parsedComment = Jsoup.parse(comment).text().trim(); - return parsedComment.length() != comment.trim().length(); - } - - public String extractUtmValues(String url) throws URISyntaxException { - if (url == null) { - return ""; - } + logger.info("Cleaning TTS data (personal info + junk removal)"); + topTaskCleaningService.cleanTopTaskSurveys(); - try { - new URL(url).toURI(); // check if the URL is well-formed - } catch (MalformedURLException | URISyntaxException e) { - return ""; - } - - URIBuilder builder = new URIBuilder(url); - return builder.getQueryParams() - .stream() - .filter(x -> x.getName().startsWith("utm_")) - .map(x -> x.getName() + "=" + x.getValue()) - .collect(Collectors.joining("&")); - } + logger.info("Cleaning Problem data (personal info removal)"); + problemCleaningService.cleanProblems(); + logger.info("Importing spreadsheets"); + spreadsheetService.importTiers(); - public String removeQueryAndFragment(String url) { - try { - URIBuilder builder = new URIBuilder(url); - // Remove query and fragment - builder.clearParameters(); - builder.setFragment(null); - return builder.build().toString(); - } catch (Exception e) { - e.printStackTrace(); - return url; // Return the original URL if there's an exception - } - } - - - // Sets attributes. Made it into a function to make the code look a bit more readable. 
- public void setAirProblemAttributes(AirTableProblemEnhanced airProblem, Problem problem) { - airProblem.setUniqueID(problem.getId()); - airProblem.setDate(problem.getProblemDate()); - airProblem.setTimeStamp(problem.getTimeStamp()); - airProblem.setURL(problem.getUrl()); - airProblem.setLang(problem.getLanguage().toUpperCase()); - airProblem.setComment(problem.getProblemDetails()); - airProblem.setIgnore(null); - airProblem.setTagsConfirmed(null); - airProblem.setRefiningDetails(""); - airProblem.setActionable(null); - airProblem.setMainSection(problem.getSection()); - airProblem.setStatus("New"); - airProblem.setLookupTags(null); - airProblem.setInstitution(problem.getInstitution()); - airProblem.setTheme(problem.getTheme()); - airProblem.setId(null); - } - - // Creates record for new titles - private void createPageTitleEntry(String title, Base base, String pageTitle) throws Exception { - @SuppressWarnings("unchecked") - Table statsTable = base.table(pageTitle, AirTableStat.class); - AirTableStat stat = new AirTableStat(title.trim()); - stat = statsTable.create(stat); - HashMap basePageTitleMap = selectMapPageTitleIds(base); - basePageTitleMap.put(title.trim().toUpperCase(), stat.getId()); - } + logger.info("Airtable & spreadsheet sync"); + airtableSyncService.syncProblemsToAirtable(); - // Creates record for new URLs - private void createUrlLinkEntry(String url, Base base, String pageTitle) throws Exception { - @SuppressWarnings("unchecked") - Table urlLinkTable = base.table(pageTitle, AirTableURLLink.class); - AirTableURLLink urlLink = new AirTableURLLink(url.trim()); - urlLink = urlLinkTable.create(urlLink); - HashMap baseURLMap = selectMapUrlLinkIds(base); - baseURLMap.put(url.trim().toUpperCase(), urlLink.getId()); + logger.info("Mark as processed"); + airtableSyncService.completeProcessing(); } - - - public Base selectBase(String base) { - if (base.equalsIgnoreCase("main")) { - return mainBase; - } - if (base.equalsIgnoreCase("health")) { - return 
healthBase; - } - if (base.equalsIgnoreCase("cra")) { - return CRA_Base; - } - if (base.equalsIgnoreCase("ircc")) { - return IRCC_Base; - } - if (base.equalsIgnoreCase("travel")) { - return travelBase; - } - return null; - } - - public HashMap selectMapPageTitleIds(Base base) { - if (base.equals(mainBase)) - return this.mainPageTitleIds; - if (base.equals(healthBase)) - return this.healthPageTitleIds; - if (base.equals(CRA_Base)) - return this.CRA_PageTitleIds; - if (base.equals(travelBase)) - return this.travelPageTitleIds; - if (base.equals(IRCC_Base)) - return this.IRCC_PageTitleIds; - return null; - } - - public HashMap selectMapUrlLinkIds(Base base) { - if (base.equals(mainBase)) - return this.mainUrlLinkIds; - if (base.equals(healthBase)) - return this.healthUrlLinkIds; - if (base.equals(CRA_Base)) - return this.CRA_UrlLinkIds; - if (base.equals(travelBase)) - return this.travelUrlLinkIds; - if (base.equals(IRCC_Base)) - return this.IRCC_UrlLinkIds; - return null; - } - - public HashMap selectMapMLTagIds(Base base) { - if (base.equals(mainBase)) - return this.mainMlTagIds; - if (base.equals(healthBase)) - return this.healthMlTagIds; - if (base.equals(CRA_Base)) - return this.CRA_MlTagIds; - if (base.equals(travelBase)) - return this.travelMlTagIds; - if (base.equals(IRCC_Base)) - return this.IRCC_MlTagIds; - return null; - } - } diff --git a/src/main/java/ca/gc/tbs/AirTableProblemEnhanced.java b/src/main/java/ca/gc/tbs/model/AirTableProblemEnhanced.java similarity index 99% rename from src/main/java/ca/gc/tbs/AirTableProblemEnhanced.java rename to src/main/java/ca/gc/tbs/model/AirTableProblemEnhanced.java index 4cce1aa..1f754aa 100644 --- a/src/main/java/ca/gc/tbs/AirTableProblemEnhanced.java +++ b/src/main/java/ca/gc/tbs/model/AirTableProblemEnhanced.java @@ -1,4 +1,4 @@ -package ca.gc.tbs; +package ca.gc.tbs.model; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/ca/gc/tbs/service/AirtableSyncService.java 
b/src/main/java/ca/gc/tbs/service/AirtableSyncService.java new file mode 100644 index 0000000..422a1f6 --- /dev/null +++ b/src/main/java/ca/gc/tbs/service/AirtableSyncService.java @@ -0,0 +1,188 @@ +package ca.gc.tbs.service; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; + +import com.sybit.airtable.Airtable; +import com.sybit.airtable.Base; +import com.sybit.airtable.Table; + +import ca.gc.tbs.domain.Problem; +import ca.gc.tbs.model.AirTableProblemEnhanced; +import ca.gc.tbs.repository.ProblemRepository; +import ca.gc.tbs.util.UrlUtils; + +/** + * Service for syncing Problem data to Airtable. + */ +@Service +public class AirtableSyncService { + private static final Logger logger = LoggerFactory.getLogger(AirtableSyncService.class); + + private static final int MAX_SYNC_RECORDS = 150; + + private final ProblemRepository problemRepository; + private final SpreadsheetService spreadsheetService; + + @Value("${airtable.key}") + private String airtableKey; + + @Value("${airtable.tab}") + private String problemAirtableTab; + + @Value("${airtable.base}") + private String problemAirtableBase; + + private Base mainBase; + private Table mainTable; + + @Autowired + public AirtableSyncService(ProblemRepository problemRepository, + SpreadsheetService spreadsheetService) { + this.problemRepository = problemRepository; + this.spreadsheetService = spreadsheetService; + } + + /** + * Initializes the Airtable connection. 
+ */ + public void initialize() throws Exception { + logger.info("Connecting to Airtable base"); + Airtable airtable = new Airtable().configure(airtableKey); + mainBase = airtable.base(problemAirtableBase); + mainTable = mainBase.table(problemAirtableTab, AirTableProblemEnhanced.class); + } + + /** + * Syncs unprocessed problems to Airtable based on tier classification. + */ + public void syncProblemsToAirtable() { + List problems = fetchUnprocessedProblems(); + List toSave = new ArrayList<>(); + + logger.info("Found {} records to be processed on Date: {}", + problems.size(), LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))); + + int processedCount = 0; + for (Problem problem : problems) { + if (processedCount >= MAX_SYNC_RECORDS) { + logger.info("Reached sync limit of {} records", MAX_SYNC_RECORDS); + break; + } + + SyncResult result = processProblem(problem); + if (result == SyncResult.SAVE) { + toSave.add(problem); + processedCount++; + } + } + + // Batch database operations + if (!toSave.isEmpty()) { + problemRepository.saveAll(toSave); + logger.info("Batch saved {} problems", toSave.size()); + } + } + + /** + * Marks all processed problems as complete. 
+ */ + public void completeProcessing() { + List problems = problemRepository.findByProcessed("false"); + problems.addAll(problemRepository.findByProcessed(null)); + + for (Problem problem : problems) { + try { + if ("true".equals(problem.getPersonalInfoProcessed()) + && "true".equals(problem.getAirTableSync()) + && (problem.getProcessed() == null || "false".equals(problem.getProcessed()))) { + problem.setProcessedDate(LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))); + problem.setProcessed("true"); + problemRepository.save(problem); + } + } catch (Exception e) { + logger.error("Could not mark completed - ID: {}", problem.getId(), e); + } + } + logger.info("Finished processing - all records marked complete"); + } + + private enum SyncResult { SAVE, SKIP } + + private List fetchUnprocessedProblems() { + List problems = problemRepository.findByAirTableSync(null); + problems.addAll(problemRepository.findByAirTableSync("false")); + return problems; + } + + private SyncResult processProblem(Problem problem) { + try { + // Extract UTM values BEFORE removing query params + String utmValues = UrlUtils.extractUtmValues(problem.getUrl()); + + // Normalize URL for tier comparisons + problem.setUrl(UrlUtils.removeQueryAndFragment(problem.getUrl().toLowerCase())); + + routeProblem(problem, utmValues); + return SyncResult.SAVE; + + } catch (Exception e) { + logger.error("Could not sync record: {} - URL: {}", problem.getId(), problem.getUrl(), e); + return SyncResult.SKIP; + } + } + + private void routeProblem(Problem problem, String utmValues) throws Exception { + String url = problem.getUrl(); + + if (!spreadsheetService.isTier1Url(url) && !spreadsheetService.isTier2Url(url)) { + spreadsheetService.addUrlToTier2(problem); + } else if (spreadsheetService.isTier2Url(url)) { + markAsProcessed(problem); + } else { + syncProblemToAirtable(problem, utmValues); + } + } + + private void syncProblemToAirtable(Problem problem, String utmValues) throws Exception { + 
AirTableProblemEnhanced airProblem = createAirTableProblem(problem, utmValues); + mainTable.create(airProblem); + problem.setAirTableSync("true"); + logger.info("Synced to Airtable (Tier 1): {}", problem.getUrl()); + } + + private void markAsProcessed(Problem problem) { + problem.setAirTableSync("true"); + logger.debug("Tier 2 URL already exists: {}", problem.getUrl()); + } + + private AirTableProblemEnhanced createAirTableProblem(Problem problem, String utmValues) { + AirTableProblemEnhanced airProblem = new AirTableProblemEnhanced(); + airProblem.setUTM(utmValues); + airProblem.setUniqueID(problem.getId()); + airProblem.setDate(problem.getProblemDate()); + airProblem.setTimeStamp(problem.getTimeStamp()); + airProblem.setURL(problem.getUrl()); + airProblem.setLang(problem.getLanguage().toUpperCase()); + airProblem.setComment(problem.getProblemDetails()); + airProblem.setIgnore(null); + airProblem.setTagsConfirmed(null); + airProblem.setRefiningDetails(""); + airProblem.setActionable(null); + airProblem.setMainSection(problem.getSection()); + airProblem.setStatus("New"); + airProblem.setLookupTags(null); + airProblem.setInstitution(problem.getInstitution()); + airProblem.setTheme(problem.getTheme()); + airProblem.setId(null); + return airProblem; + } +} diff --git a/src/main/java/ca/gc/tbs/service/ProblemCleaningService.java b/src/main/java/ca/gc/tbs/service/ProblemCleaningService.java new file mode 100644 index 0000000..f2c9e9c --- /dev/null +++ b/src/main/java/ca/gc/tbs/service/ProblemCleaningService.java @@ -0,0 +1,102 @@ +package ca.gc.tbs.service; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +import ca.gc.tbs.domain.Problem; +import ca.gc.tbs.repository.ProblemRepository; +import ca.gc.tbs.util.ValidationUtils; +import ca.gc.tbs.service.BadWords; + +/** + * 
Service for cleaning Problem data. + */ +@Service +public class ProblemCleaningService { + private static final Logger logger = LoggerFactory.getLogger(ProblemCleaningService.class); + + private static final int MAX_COMMENT_LENGTH = 301; + + private final ContentService contentService; + private final ProblemRepository problemRepository; + private final SpreadsheetService spreadsheetService; + + @Autowired + public ProblemCleaningService(ContentService contentService, + ProblemRepository problemRepository, + SpreadsheetService spreadsheetService, + BadWords badWords) { + // Load BadWords config (JAR uses javax.annotation.PostConstruct which Spring Boot 3.x ignores) + badWords.loadConfigs(); + + this.contentService = contentService; + this.problemRepository = problemRepository; + this.spreadsheetService = spreadsheetService; + } + + /** + * Cleans all unprocessed Problem entries. + * Removes junk/duplicates and cleans personal info from valid records. + */ + public void cleanProblems() { + Set seenComments = new HashSet<>(); + List problems = fetchUncleanedProblems(); + logger.info("Number of Problems to clean: {}", problems.size()); + + for (Problem problem : problems) { + try { + processProblem(problem, seenComments); + } catch (Exception e) { + logger.error("Could not process problem: {} - Details: {}", + problem.getId(), problem.getProblemDetails(), e); + } + } + logger.info("Problem cleaning complete"); + } + + private List fetchUncleanedProblems() { + List problems = problemRepository.findByPersonalInfoProcessed(null); + problems.addAll(problemRepository.findByPersonalInfoProcessed("false")); + return problems; + } + + private void processProblem(Problem problem, Set seenComments) { + // Check for junk first - delete immediately without wasting time cleaning + if (isJunkComment(problem)) { + logger.info("Deleting junk comment: {}", problem.getId()); + problemRepository.delete(problem); + return; + } + + // Check for duplicates within this batch + String 
normalizedComment = problem.getProblemDetails().trim().toLowerCase(); + if (ValidationUtils.isDuplicateComment(normalizedComment, seenComments)) { + logger.info("Deleting duplicate comment: {}", problem.getProblemDetails()); + spreadsheetService.logDuplicateComment(problem); + problemRepository.delete(problem); + return; + } + seenComments.add(normalizedComment); + + // Clean personal info from valid, non-duplicate records + String details = contentService.cleanContent(problem.getProblemDetails()); + problem.setProblemDetails(details); + problem.setPersonalInfoProcessed("true"); + problemRepository.save(problem); + } + + private boolean isJunkComment(Problem problem) { + String details = problem.getProblemDetails(); + String url = problem.getUrl(); + return details.trim().isEmpty() + || ValidationUtils.containsHTML(details) + || "https://www.canada.ca/".equals(url) + || details.length() > MAX_COMMENT_LENGTH; + } +} diff --git a/src/main/java/ca/gc/tbs/service/SpreadsheetService.java b/src/main/java/ca/gc/tbs/service/SpreadsheetService.java new file mode 100644 index 0000000..ee223fa --- /dev/null +++ b/src/main/java/ca/gc/tbs/service/SpreadsheetService.java @@ -0,0 +1,125 @@ +package ca.gc.tbs.service; + +import java.io.InputStreamReader; +import java.io.Reader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.HashSet; +import java.util.Set; +import java.util.function.Consumer; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; + +import ca.gc.tbs.GoogleSheetsAPI; +import ca.gc.tbs.domain.Problem; + +/** + * Service for managing tier spreadsheet operations. 
+ */ +@Service +public class SpreadsheetService { + private static final Logger logger = LoggerFactory.getLogger(SpreadsheetService.class); + + private static final String TIER1_SPREADSHEET_URL = + "https://docs.google.com/spreadsheets/d/1eOmX_b8XCR9eLNxUbX3Gwkp2ywJ-vhapnC7ApdRbnSg/export?format=csv"; + private static final String TIER2_SPREADSHEET_URL = + "https://docs.google.com/spreadsheets/d/1B16qEbfp7SFCfIsZ8fcj7DneCy1WkR0GPh4t9L9NRSg/export?format=csv"; + + private final Set tier1Urls = new HashSet<>(); + private final Set tier2Urls = new HashSet<>(); + + /** + * Imports both Tier 1 and Tier 2 spreadsheets. + */ + public void importTiers() throws Exception { + importTier1(); + importTier2(); + } + + /** + * Imports Tier 1 URLs from the spreadsheet. + */ + public void importTier1() throws Exception { + parseCsvFromUrl(TIER1_SPREADSHEET_URL, record -> { + tier1Urls.add(record.get("URL").toLowerCase()); + }, "Tier 1"); + logger.info("Imported {} Tier 1 URLs", tier1Urls.size()); + } + + /** + * Imports Tier 2 URLs from the spreadsheet. + */ + public void importTier2() throws Exception { + parseCsvFromUrl(TIER2_SPREADSHEET_URL, record -> { + tier2Urls.add(record.get("URL").toLowerCase()); + }, "Tier 2"); + logger.info("Imported {} Tier 2 URLs", tier2Urls.size()); + } + + /** + * Checks if a URL is in the Tier 1 spreadsheet. + */ + public boolean isTier1Url(String url) { + return tier1Urls.contains(url); + } + + /** + * Checks if a URL is in the Tier 2 spreadsheet. + */ + public boolean isTier2Url(String url) { + return tier2Urls.contains(url); + } + + /** + * Logs a duplicate comment to the Google Sheets duplicate tracker. + */ + public void logDuplicateComment(Problem problem) { + try { + String date = problem.getProblemDate() != null + ? 
problem.getProblemDate() + : LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd")); + GoogleSheetsAPI.appendDuplicateComment(date, problem.getTimeStamp(), + problem.getUrl(), problem.getProblemDetails()); + } catch (Exception e) { + logger.error("Error writing duplicate to spreadsheet", e); + } + } + + /** + * Adds a URL to the Tier 2 spreadsheet (both local cache and Google Sheets). + */ + public void addUrlToTier2(Problem problem) throws Exception { + tier2Urls.add(problem.getUrl()); + GoogleSheetsAPI.appendURL(problem.getUrl()); + problem.setAirTableSync("true"); + logger.info("URL not in spreadsheet: {}, added to Tier 2 Spreadsheet", problem.getUrl()); + } + + private void parseCsvFromUrl(String url, Consumer recordProcessor, String tierName) + throws Exception { + try (Reader reader = new InputStreamReader( + new URL(url).openConnection().getInputStream(), + StandardCharsets.UTF_8)) { + + final CSVFormat csvFormat = CSVFormat.Builder.create() + .setHeader() + .setAllowMissingColumnNames(true) + .build(); + final Iterable records = csvFormat.parse(reader); + + for (final CSVRecord record : records) { + try { + recordProcessor.accept(record); + } catch (Exception e) { + logger.error("Error importing {} spreadsheet record", tierName, e); + } + } + } + } +} diff --git a/src/main/java/ca/gc/tbs/service/TopTaskCleaningService.java b/src/main/java/ca/gc/tbs/service/TopTaskCleaningService.java new file mode 100644 index 0000000..6641381 --- /dev/null +++ b/src/main/java/ca/gc/tbs/service/TopTaskCleaningService.java @@ -0,0 +1,102 @@ +package ca.gc.tbs.service; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.List; +import java.util.function.Consumer; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +import ca.gc.tbs.domain.TopTaskSurvey; +import ca.gc.tbs.repository.TopTaskRepository; 
+import ca.gc.tbs.util.ValidationUtils;
+
+/**
+ * Service for cleaning Top Task Survey data.
+ */
+@Service
+public class TopTaskCleaningService {
+    private static final Logger logger = LoggerFactory.getLogger(TopTaskCleaningService.class);
+
+    private final ContentService contentService;
+    private final TopTaskRepository topTaskRepository;
+
+    @Autowired
+    public TopTaskCleaningService(ContentService contentService, TopTaskRepository topTaskRepository) {
+        this.contentService = contentService;
+        this.topTaskRepository = topTaskRepository;
+    }
+
+    /**
+     * Cleans every Top Task Survey entry whose processed flag is "false": entries with HTML in a free-text
+     * field are deleted, the rest are cleaned and saved. A failing entry is logged and does not stop the run.
+     */
+    public void cleanTopTaskSurveys() {
+        List<TopTaskSurvey> tasks = topTaskRepository.findByProcessed("false");
+        logger.info("Number of TTS entries to clean: {}", tasks.size());
+
+        for (TopTaskSurvey task : tasks) {
+            try {
+                processTask(task);
+            } catch (Exception e) {
+                logger.error("Could not process task: {} - DateTime: {}",
+                        task.getId(), task.getDateTime(), e);
+            }
+        }
+        logger.info("TTS cleaning complete");
+    }
+
+    private void processTask(TopTaskSurvey task) {
+        // Check for junk first - delete immediately without wasting time cleaning
+        if (task == null) {
+            logger.warn("Skipping null task");
+            return;
+        }
+        if (hasHTMLInAnyField(task)) { // true when any of the four free-text fields contains HTML
+            logger.warn("Deleting junk task: {} - Had null or hyperlink", task.getId());
+            topTaskRepository.delete(task);
+            return;
+        }
+
+        // Blank out free-text fields that contain only whitespace
+        trimWhitespaceField(task.getTaskOther(), task::setTaskOther, "taskOther");
+        trimWhitespaceField(task.getThemeOther(), task::setThemeOther, "themeOther");
+        trimWhitespaceField(task.getTaskImproveComment(), task::setTaskImproveComment, "taskImproveComment");
+        trimWhitespaceField(task.getTaskWhyNotComment(), task::setTaskWhyNotComment, "taskWhyNotComment");
+
+        // Clean personal info from valid records
+        cleanTaskField(task.getThemeOther(), task::setThemeOther);
+        cleanTaskField(task.getTaskOther(), task::setTaskOther);
+        cleanTaskField(task.getTaskImproveComment(), task::setTaskImproveComment);
+        cleanTaskField(task.getTaskWhyNotComment(), task::setTaskWhyNotComment);
+
+        task.setPersonalInfoProcessed("true");
+        task.setProcessed("true");
+        task.setProcessedDate(LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd")));
+        topTaskRepository.save(task);
+    }
+
+    private void cleanTaskField(String fieldValue, Consumer<String> setter) {
+        if (fieldValue != null) { // null fields are left as they are
+            String cleaned = contentService.cleanContent(fieldValue);
+            setter.accept(cleaned);
+        }
+    }
+
+    private boolean hasHTMLInAnyField(TopTaskSurvey task) {
+        return ValidationUtils.containsHTML(task.getTaskOther())
+                || ValidationUtils.containsHTML(task.getThemeOther())
+                || ValidationUtils.containsHTML(task.getTaskImproveComment())
+                || ValidationUtils.containsHTML(task.getTaskWhyNotComment());
+    }
+
+    private void trimWhitespaceField(String fieldValue, Consumer<String> setter, String fieldName) {
+        if (fieldValue != null && fieldValue.trim().isEmpty() && !fieldValue.isEmpty()) {
+            logger.debug("Found junk data in {}", fieldName);
+            setter.accept(""); // despite the name, only whitespace-only values are changed
+        }
+    }
+}
diff --git a/src/main/java/ca/gc/tbs/util/UrlUtils.java b/src/main/java/ca/gc/tbs/util/UrlUtils.java
new file mode 100644
index 0000000..cdda574
--- /dev/null
+++ b/src/main/java/ca/gc/tbs/util/UrlUtils.java
@@ -0,0 +1,72 @@
+package ca.gc.tbs.util;
+
+import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.stream.Collectors;
+
+import org.apache.hc.core5.net.URIBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for URL manipulation operations.
+ */
+public final class UrlUtils {
+    private static final Logger logger = LoggerFactory.getLogger(UrlUtils.class);
+
+    private UrlUtils() {
+        // Utility class - prevent instantiation
+    }
+
+    /**
+     * Extracts UTM parameters from a URL and returns them as a query string.
+     *
+     * @param url the URL to extract UTM values from; may be null
+     * @return UTM parameters as "utm_x=value&utm_y=value" or empty string if none found or the URL is null or malformed
+     */
+    public static String extractUtmValues(String url) {
+        if (url == null) {
+            return "";
+        }
+
+        try {
+            new URL(url).toURI(); // validation only: the parsed result is discarded
+        } catch (MalformedURLException | URISyntaxException e) {
+            return "";
+        }
+
+        try {
+            URIBuilder builder = new URIBuilder(url);
+            return builder.getQueryParams()
+                    .stream()
+                    .filter(x -> x.getName().startsWith("utm_"))
+                    .map(x -> x.getName() + "=" + x.getValue())
+                    .collect(Collectors.joining("&"));
+        } catch (URISyntaxException e) {
+            logger.error("Error extracting UTM values from URL: {}", url, e);
+            return "";
+        }
+    }
+
+    /**
+     * Removes query parameters and fragment from a URL.
+     *
+     * @param url the URL to clean; may be null
+     * @return URL without query parameters and fragment, or the input unchanged if it is null or parsing fails
+     */
+    public static String removeQueryAndFragment(String url) {
+        if (url == null) { // nothing to parse; avoids an error-level log entry for a missing URL
+            return null;
+        }
+        try {
+            URIBuilder builder = new URIBuilder(url);
+            builder.clearParameters();
+            builder.setFragment(null);
+            return builder.build().toString();
+        } catch (Exception e) {
+            logger.error("Error removing query and fragment from URL: {}", url, e);
+            return url;
+        }
+    }
+}
diff --git a/src/main/java/ca/gc/tbs/util/ValidationUtils.java b/src/main/java/ca/gc/tbs/util/ValidationUtils.java
new file mode 100644
index 0000000..2d75b3a
--- /dev/null
+++ b/src/main/java/ca/gc/tbs/util/ValidationUtils.java
@@ -0,0 +1,40 @@
+package ca.gc.tbs.util;
+
+import java.util.Set;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+
+/**
+ * Utility class for content validation operations.
+ */
+public final class ValidationUtils {
+
+    private ValidationUtils() {
+        // Utility class - prevent instantiation
+    }
+
+    /**
+     * Checks if the given text contains HTML markup: true when Jsoup's extracted text differs in length from the input.
+     *
+     * @param text the text to check; null is reported as containing no HTML
+     * @return true if HTML is detected, false otherwise
+     */
+    public static boolean containsHTML(String text) {
+        if (text == null) return false;
+        text = StringUtils.normalizeSpace(text);
+        String parsedText = Jsoup.parse(text).text().trim();
+        return parsedText.length() != text.trim().length(); // any length difference counts as HTML
+    }
+
+    /**
+     * Checks if a normalized comment already exists in the seen comments set.
+     *
+     * @param normalizedComment the comment normalized to lowercase and trimmed
+     * @param seenComments set of previously seen comments
+     * @return true if duplicate, false otherwise
+     */
+    public static boolean isDuplicateComment(String normalizedComment, Set<String> seenComments) {
+        return seenComments.contains(normalizedComment);
+    }
+}
diff --git a/src/main/resources/application.properties.gpg b/src/main/resources/application.properties.gpg
index c91772c..17b3837 100644
Binary files a/src/main/resources/application.properties.gpg and b/src/main/resources/application.properties.gpg differ
diff --git a/src/main/resources/service-account.json.gpg b/src/main/resources/service-account.json.gpg
new file mode 100644
index 0000000..4040b55
Binary files /dev/null and b/src/main/resources/service-account.json.gpg differ
diff --git a/src/main/resources/service-account.p12.gpg b/src/main/resources/service-account.p12.gpg
deleted file mode 100644
index 1293119..0000000
Binary files a/src/main/resources/service-account.p12.gpg and /dev/null differ