From beb4597265888133717494332eb67905d68ba53c Mon Sep 17 00:00:00 2001 From: dheeraj-kholia-cs Date: Thu, 30 Jan 2025 14:43:31 +0530 Subject: [PATCH 1/6] Error management for row excel plugin --- .../source/ExcelErrorDetailsProvider.java | 26 +++++++++++++++++++ .../plugin/batch/source/ExcelInputFormat.java | 7 ++++- .../plugin/batch/source/ExcelInputReader.java | 17 +++++++++--- 3 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java new file mode 100644 index 000000000..4c0d416fc --- /dev/null +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java @@ -0,0 +1,26 @@ +/* + * Copyright © 2025 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.batch.source; + +import io.cdap.plugin.common.HydratorErrorDetailsProvider; + +/** + * ExcelErrorDetailsProvider provider + */ +public class ExcelErrorDetailsProvider extends HydratorErrorDetailsProvider { + +} diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java index a7e46a30f..5b38642b3 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java @@ -19,6 +19,9 @@ import com.github.pjfanning.xlsx.StreamingReader; import com.google.common.base.Preconditions; import com.google.common.base.Strings; +import io.cdap.cdap.api.exception.ErrorCategory; +import io.cdap.cdap.api.exception.ErrorType; +import io.cdap.cdap.api.exception.ErrorUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -193,7 +196,9 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro workSheet = workbook.getSheetAt(Integer.parseInt(sheetValue)); } } catch (Exception e) { - throw new IllegalArgumentException("Exception while reading excel sheet. " + e.getMessage(), e); + String error = String.format("Exception while reading excel sheet: %s", e.getMessage()); + throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), + error, error, ErrorType.USER, false, null); } // As we cannot get the number of rows in a sheet while streaming. diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java index 38157d85e..15651673a 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java @@ -35,12 +35,16 @@ import io.cdap.cdap.api.dataset.lib.KeyValue; import io.cdap.cdap.api.dataset.lib.KeyValueTable; import io.cdap.cdap.api.dataset.table.Table; +import io.cdap.cdap.api.exception.ErrorCategory; +import io.cdap.cdap.api.exception.ErrorType; +import io.cdap.cdap.api.exception.ErrorUtils; import io.cdap.cdap.etl.api.Emitter; import io.cdap.cdap.etl.api.FailureCollector; import io.cdap.cdap.etl.api.PipelineConfigurer; import io.cdap.cdap.etl.api.batch.BatchRuntimeContext; import io.cdap.cdap.etl.api.batch.BatchSource; import io.cdap.cdap.etl.api.batch.BatchSourceContext; +import io.cdap.cdap.etl.api.exception.ErrorDetailsProviderSpec; import io.cdap.plugin.common.LineageRecorder; import io.cdap.plugin.common.Properties; import io.cdap.plugin.common.ReferencePluginConfig; @@ -192,10 +196,11 @@ public void transform(KeyValue input, Emitter 1 && excelInputreaderConfig.terminateIfEmptyRow.equalsIgnoreCase("true")) { - throw new ExecutionException("Encountered empty row while reading Excel file :" + fileName + - " . Terminating processing", new Throwable()); + String error = String.format("Encountered empty row while reading Excel file :%s." + + " Terminating processing", fileName); + throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), + error, error, ErrorType.USER, false, null); } - prevRowNum = currentRowNum; Map excelColumnValueMap = new HashMap<>(); @@ -212,6 +217,9 @@ public void transform(KeyValue input, Emitter Date: Thu, 30 Jan 2025 15:07:16 +0530 Subject: [PATCH 2/6] Corrected cause for exception --- .../main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java index 5b38642b3..2db71a674 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java @@ -198,7 +198,7 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro } catch (Exception e) { String error = String.format("Exception while reading excel sheet: %s", e.getMessage()); throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), - error, error, ErrorType.USER, false, null); + error, error, ErrorType.USER, false, e); } // As we cannot get the number of rows in a sheet while streaming. From 9383bd2cba65bcaaea97a2bf61b8307ce4d5f972 Mon Sep 17 00:00:00 2001 From: dheeraj-kholia-cs Date: Thu, 6 Feb 2025 10:57:00 +0530 Subject: [PATCH 3/6] Added more exception handling --- .../source/ExcelErrorDetailsProvider.java | 71 ++++++++++++++++++- .../plugin/batch/source/ExcelInputReader.java | 6 +- 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java index 4c0d416fc..1da18369d 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java @@ -16,11 +16,76 @@ package io.cdap.plugin.batch.source; -import io.cdap.plugin.common.HydratorErrorDetailsProvider; +import com.github.pjfanning.xlsx.exceptions.MissingSheetException; +import com.github.pjfanning.xlsx.exceptions.ReadException; +import com.google.common.base.Throwables; + +import io.cdap.cdap.api.exception.ErrorCategory; +import io.cdap.cdap.api.exception.ErrorType; +import io.cdap.cdap.api.exception.ErrorUtils; +import io.cdap.cdap.api.exception.ProgramFailureException; +import io.cdap.cdap.etl.api.exception.ErrorContext; +import io.cdap.cdap.etl.api.exception.ErrorDetailsProvider; + +import org.apache.poi.EmptyFileException; + +import java.io.IOException; +import java.util.List; + +import javax.annotation.Nullable; /** * ExcelErrorDetailsProvider provider */ -public class ExcelErrorDetailsProvider extends HydratorErrorDetailsProvider { - +public class ExcelErrorDetailsProvider implements ErrorDetailsProvider { + + private static final String ERROR_MESSAGE_FORMAT = "Error occurred in the phase: '%s'. Error message: %s"; + + @Nullable + @Override + public ProgramFailureException getExceptionDetails(Exception e, ErrorContext errorContext) { + List causalChain = Throwables.getCausalChain(e); + for (Throwable t : causalChain) { + if (t instanceof ProgramFailureException) { + // if causal chain already has program failure exception, return null to avoid double wrap. + return null; + } + if (t instanceof MissingSheetException) { + return getProgramFailureException((MissingSheetException) t, errorContext, + ErrorType.USER); + } + if (t instanceof ReadException) { + return getProgramFailureException((ReadException) t, errorContext, + ErrorType.USER); + } + if (t instanceof EmptyFileException) { + return getProgramFailureException((EmptyFileException) t, errorContext, + ErrorType.USER); + } + if (t instanceof IllegalArgumentException) { + return getProgramFailureException((IllegalArgumentException) t, errorContext, + ErrorType.USER); + } + if (t instanceof IOException) { + return getProgramFailureException((IOException) t, errorContext, + ErrorType.USER); + } + } + return null; + } + + /** + * Get a ProgramFailureException with the given error information from {@link Exception}. + * + * @param exception The Exception to get the error information from. + * @return A ProgramFailureException with the given error information. + */ + private ProgramFailureException getProgramFailureException(Exception exception, + ErrorContext errorContext, ErrorType errorType) { + String errorMessage = exception.getMessage(); + return ErrorUtils.getProgramFailureException( + new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), errorMessage, + String.format(ERROR_MESSAGE_FORMAT, errorContext.getPhase(), errorMessage), errorType, + false, exception); + } } diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java index 15651673a..39e2b479c 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java @@ -453,8 +453,10 @@ private void getOutputSchema() { } } } catch (Exception e) { - throw new IllegalArgumentException("Exception while creating output schema for Excel input reader. " + - "Invalid output " + "schema: " + e.getMessage(), e); + String error = String.format("Exception while creating output schema for Excel input reader. " + + "Invalid output " + "schema: %s", e.getMessage()); + throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), + error, error, ErrorType.USER, false, e); } outputSchema = Schema.recordOf("outputSchema", outputFields); } From 3990f6cd28ce784a84004a48f3f410b8317a21f6 Mon Sep 17 00:00:00 2001 From: dheeraj-kholia-cs Date: Thu, 6 Feb 2025 11:51:10 +0530 Subject: [PATCH 4/6] Removed IOException --- .../cdap/plugin/batch/source/ExcelErrorDetailsProvider.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java index 1da18369d..24ffb9255 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java @@ -66,10 +66,6 @@ public ProgramFailureException getExceptionDetails(Exception e, ErrorContext err return getProgramFailureException((IllegalArgumentException) t, errorContext, ErrorType.USER); } - if (t instanceof IOException) { - return getProgramFailureException((IOException) t, errorContext, - ErrorType.USER); - } } return null; } From f64cec89fd1f576bfc6b4689f35964c8a4912466 Mon Sep 17 00:00:00 2001 From: dheeraj-kholia-cs Date: Tue, 11 Feb 2025 15:15:27 +0530 Subject: [PATCH 5/6] Added subcategory for exceptions --- .../source/ExcelErrorDetailsProvider.java | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java index 24ffb9255..bf29037d3 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java @@ -19,20 +19,16 @@ import com.github.pjfanning.xlsx.exceptions.MissingSheetException; import com.github.pjfanning.xlsx.exceptions.ReadException; import com.google.common.base.Throwables; - import io.cdap.cdap.api.exception.ErrorCategory; import io.cdap.cdap.api.exception.ErrorType; import io.cdap.cdap.api.exception.ErrorUtils; import io.cdap.cdap.api.exception.ProgramFailureException; import io.cdap.cdap.etl.api.exception.ErrorContext; import io.cdap.cdap.etl.api.exception.ErrorDetailsProvider; - import org.apache.poi.EmptyFileException; -import java.io.IOException; -import java.util.List; - import javax.annotation.Nullable; +import java.util.List; /** * ExcelErrorDetailsProvider provider @@ -40,6 +36,9 @@ public class ExcelErrorDetailsProvider implements ErrorDetailsProvider { private static final String ERROR_MESSAGE_FORMAT = "Error occurred in the phase: '%s'. Error message: %s"; + private static final String SUBCATEGORY_CONFIGURATION = "Configuration"; + private static final String SUBCATEGORY_DATA_MISSING = "Data Missing"; + private static final String SUBCATEGORY_FILE_READ_ERROR = "File Read Error"; @Nullable @Override @@ -52,19 +51,19 @@ public ProgramFailureException getExceptionDetails(Exception e, ErrorContext err } if (t instanceof MissingSheetException) { return getProgramFailureException((MissingSheetException) t, errorContext, - ErrorType.USER); + ErrorType.USER, SUBCATEGORY_DATA_MISSING); } if (t instanceof ReadException) { return getProgramFailureException((ReadException) t, errorContext, - ErrorType.USER); + ErrorType.USER, SUBCATEGORY_FILE_READ_ERROR); } if (t instanceof EmptyFileException) { return getProgramFailureException((EmptyFileException) t, errorContext, - ErrorType.USER); + ErrorType.USER, SUBCATEGORY_DATA_MISSING); } if (t instanceof IllegalArgumentException) { return getProgramFailureException((IllegalArgumentException) t, errorContext, - ErrorType.USER); + ErrorType.USER, SUBCATEGORY_CONFIGURATION); } } return null; @@ -76,11 +75,11 @@ public ProgramFailureException getExceptionDetails(Exception e, ErrorContext err * @param exception The Exception to get the error information from. * @return A ProgramFailureException with the given error information. */ - private ProgramFailureException getProgramFailureException(Exception exception, - ErrorContext errorContext, ErrorType errorType) { + private ProgramFailureException getProgramFailureException(Exception exception, ErrorContext errorContext, + ErrorType errorType, String subCategory) { String errorMessage = exception.getMessage(); return ErrorUtils.getProgramFailureException( - new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), errorMessage, + new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN, subCategory), errorMessage, String.format(ERROR_MESSAGE_FORMAT, errorContext.getPhase(), errorMessage), errorType, false, exception); } From 61bb7666c9b1f0e5a85de88d1288abc406f36011 Mon Sep 17 00:00:00 2001 From: dheeraj-kholia-cs Date: Tue, 11 Feb 2025 16:28:16 +0530 Subject: [PATCH 6/6] Checkstyle fix --- .../io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java index bf29037d3..e73529e2d 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelErrorDetailsProvider.java @@ -27,8 +27,8 @@ import io.cdap.cdap.etl.api.exception.ErrorDetailsProvider; import org.apache.poi.EmptyFileException; -import javax.annotation.Nullable; import java.util.List; +import javax.annotation.Nullable; /** * ExcelErrorDetailsProvider provider