Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions core-plugins/docs/File-batchsource.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ Properties
**Path:** Path to read from. For example, s3a://<bucket>/path/to/input

**Format:** Format of the data to read.
The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', 'tsv', or the
The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', 'tsv', 'xls', or the
name of any format plugin that you have deployed to your environment.
If the format is a macro, only the pre-packaged formats can be used.
If the format is 'blob', every input file will be read into a separate record.
The 'blob' format also requires a schema that contains a field named 'body' of type 'bytes'.
If the format is 'text', the schema must contain a field named 'body' of type 'string'.

**Get Schema:** Auto-detects schema from file. Supported formats are: avro, parquet, csv, delimited, tsv, blob
**Get Schema:** Auto-detects schema from file. Supported formats are: avro, parquet, csv, delimited, tsv, blob, xls
and text.

Blob - is set by default as field named 'body' of type bytes.
Expand All @@ -37,14 +37,22 @@ If no such file can be found, an error will be returned.
Avro - If the path is a directory, the plugin will look for files ending in '.avro' to read the schema from.
If no such file can be found, an error will be returned.

**Sample Size:** The maximum number of rows that will get investigated for automatic data type detection.
The default value is 1000. This is only used when the format is 'xls'.

**Override:** A list of columns with the corresponding data types for which the automatic data type detection gets
skipped.

**Sample Size:** The maximum number of rows in a file that will get investigated for automatic data type detection.
skipped. This is only used when the format is 'xls'.

**Terminate Reading After Empty Row:** Specify whether to stop reading after encountering the first empty row. Defaults to false. When false the reader will read all rows in the sheet. This is only used when the format is 'xls'.

**Select Sheet Using:** Select the sheet by name or number. Default is 'Sheet Number'. This is only used when the format is 'xls'.

**Sheet Value:** The name/number of the sheet to read from. If not specified, the first sheet will be read.
Sheet numbers are 0-based, i.e., the first sheet is 0. This is only used when the format is 'xls'.

**Delimiter:** Delimiter to use when the format is 'delimited'. This will be ignored for other formats.

**Use First Row as Header:** Whether to use the first line of each file as the column headers. Supported formats are 'text', 'csv', 'tsv', 'delimited'.
**Use First Row as Header:** Whether to use the first line of each file as the column headers. Supported formats are 'text', 'csv', 'tsv', 'xls', 'delimited'.

**Enable Quoted Values:** Whether to treat content between quotes as a value. This value will only be used if the format
is 'csv', 'tsv' or 'delimited'. For example, if this is set to true, a line that looks like `1, "a, b, c"` will output two fields.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Copyright © 2025 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.plugin.batch.action;

import io.cdap.plugin.batch.source.FileErrorDetailsProvider;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import java.io.IOException;
import javax.annotation.Nullable;

/**
 * Utility class for common file action operations.
 *
 * <p>Holds static helpers only and is not meant to be instantiated or extended.
 */
public final class FileActionUtils {

  private FileActionUtils() {
    // Static helpers only; prevent instantiation.
  }

  /**
   * Lists the statuses of the entries under the given path, optionally restricted by a filter.
   *
   * <p>An {@link IOException} raised by the listing is not propagated as-is: it is handed to
   * {@code FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain} together with a
   * "Failed to list files in &lt;path&gt;." reason, and the exception returned by that call is thrown instead.
   *
   * @param fileSystem file system on which the listing is performed
   * @param path path whose entries are listed
   * @param filter filter passed to the listing, or {@code null} to list without a filter
   * @return the array returned by {@code FileSystem.listStatus}
   */
  static FileStatus[] getFileStatuses(FileSystem fileSystem, Path path, @Nullable PathFilter filter) {
    try {
      if (filter == null) {
        return fileSystem.listStatus(path);
      }
      return fileSystem.listStatus(path, filter);
    } catch (IOException e) {
      String errorReason = String.format("Failed to list files in %s.", path);
      throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.exception.ErrorCategory;
import io.cdap.cdap.api.exception.ErrorType;
import io.cdap.cdap.api.exception.ErrorUtils;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.action.Action;
import io.cdap.cdap.etl.api.action.ActionContext;
import io.cdap.plugin.batch.source.FileErrorDetailsProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
Expand Down Expand Up @@ -66,7 +70,13 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
public void run(ActionContext context) throws Exception {
Path path = new Path(config.path);

FileSystem fileSystem = path.getFileSystem(new Configuration());
FileSystem fileSystem;
try {
fileSystem = path.getFileSystem(new Configuration());
} catch (IOException e) {
String errorReason = String.format("Failed to get FileSystem for path %s.", path);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}

FileStatus[] listFiles;
if (config.fileRegex != null) {
Expand All @@ -78,18 +88,24 @@ public boolean accept(Path path) {
return pattern.matcher(path.getName()).matches();
}
};
listFiles = fileSystem.listStatus(path, filter);
listFiles = FileActionUtils.getFileStatuses(fileSystem, path, filter);
} else {
listFiles = fileSystem.listStatus(path);
listFiles = FileActionUtils.getFileStatuses(fileSystem, path, null);
}

for (FileStatus file : listFiles) {
Path currPath = file.getPath();
removePath(fileSystem, currPath);
}


if (fileSystem.isDirectory(path) && config.fileRegex == null) {
boolean isDirectory = false;
try {
isDirectory = fileSystem.isDirectory(path);
} catch (IOException e) {
String errorReason = String.format("Failed to check if %s is a directory.", path);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
if (isDirectory && config.fileRegex == null) {
removePath(fileSystem, path);
}

Expand All @@ -99,13 +115,16 @@ public void removePath(FileSystem fileSystem, Path currPath) throws Exception {
try {
if (!fileSystem.delete(currPath, true)) {
if (!config.continueOnError) {
throw new IOException(String.format("Removal of %s was unsuccessful.", currPath.toString()));
String error = String.format("Removal of %s was unsuccessful.", currPath.toString());
throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN),
error, error, ErrorType.USER, false, null);
}
LOG.warn("Removal of {} was unsuccessful.", currPath.toString());
}
} catch (IOException e) {
if (!config.continueOnError) {
throw e;
String errorReason = String.format("Removal of %s was unsuccessful.", currPath.toString());
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
LOG.warn("Removal of {} was unsuccessful.", currPath.toString());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.exception.ErrorCategory;
import io.cdap.cdap.api.exception.ErrorType;
import io.cdap.cdap.api.exception.ErrorUtils;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.action.Action;
import io.cdap.cdap.etl.api.action.ActionContext;
import io.cdap.plugin.batch.source.FileErrorDetailsProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
Expand Down Expand Up @@ -69,21 +73,42 @@ public void run(ActionContext context) throws Exception {

Path dest = new Path(config.destPath);

FileSystem fileSystem = source.getFileSystem(new Configuration());
fileSystem.mkdirs(dest.getParent());
FileSystem fileSystem;
try {
fileSystem = source.getFileSystem(new Configuration());
} catch (IOException e) {
String errorReason = String.format("Failed to get file system for source path %s", source);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
try {
fileSystem.mkdirs(dest.getParent());
} catch (IOException e) {
String errorReason = String.format("Failed to create parent directory for dest path %s", dest);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
FileStatus fileStatus;
try {
fileStatus = fileSystem.getFileStatus(source);
} catch (IOException e) {
String errorReason = String.format("Failed to get file status for source path %s", source);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}

if (fileSystem.getFileStatus(source).isFile()) { //moving single file
if (fileStatus != null && fileStatus.isFile()) { //moving single file

try {
if (!fileSystem.rename(source, dest)) {
if (!config.continueOnError) {
throw new IOException(String.format("Failed to rename file %s to %s", source, dest));
String error = String.format("Failed to move file %s to %s", source, dest);
throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN),
error, error, ErrorType.USER, false, null);
}
LOG.error("Failed to move file {} to {}", source, dest);
}
} catch (IOException e) {
if (!config.continueOnError) {
throw e;
String errorReason = String.format("Failed to move file %s to %s", source, dest);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
LOG.error("Failed to move file {} to {}", source, dest, e);
}
Expand All @@ -102,38 +127,48 @@ public boolean accept(Path path) {
}
};

listFiles = fileSystem.listStatus(source, filter);
listFiles = FileActionUtils.getFileStatuses(fileSystem, source, filter);
} else {
listFiles = fileSystem.listStatus(source);
listFiles = FileActionUtils.getFileStatuses(fileSystem, source, null);
}

if (listFiles.length == 0) {
if (config.fileRegex != null) {
LOG.warn("Not moving any files of type {} from source {}", config.fileRegex, source.toString());
LOG.warn("Not moving any files of type {} from source {}", config.fileRegex, source);
} else {
LOG.warn("Not moving any files from source {}", source.toString());
LOG.warn("Not moving any files from source {}", source);
}
}

if (fileSystem.isFile(dest)) {
throw new IllegalArgumentException(String.format("destPath %s needs to be a directory since sourcePath is a " +
"directory", config.destPath));
try {
if (fileSystem.isFile(dest)) {
String error = String.format("destPath %s needs to be a directory since sourcePath is a directory",
config.destPath);
throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN), error,
error, ErrorType.USER, false, null);
}
fileSystem.mkdirs(dest); // create destination directory if necessary
} catch (IOException e) {
String errorReason = String.format("Failed to create destination directory %s", dest);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
fileSystem.mkdirs(dest); //create destination directory if necessary

for (FileStatus file : listFiles) {
source = file.getPath();

try {
if (!fileSystem.rename(source, dest)) {
if (!config.continueOnError) {
throw new IOException(String.format("Failed to rename file %s to %s", source, dest));
String error = String.format("Failed to rename file %s to %s", source, dest);
throw ErrorUtils.getProgramFailureException(new ErrorCategory(ErrorCategory.ErrorCategoryEnum.PLUGIN),
error, error, ErrorType.USER, false, null);
}
LOG.error("Failed to move file {} to {}", source, dest);
}
} catch (IOException e) {
if (!config.continueOnError) {
throw e;
String errorReason = String.format("Failed to rename file %s to %s", source, dest);
throw FileErrorDetailsProvider.getFileBasedProgramFailureExceptionDetailsFromChain(e, errorReason);
}
LOG.error("Failed to move file {} to {}", source, dest, e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import com.google.gson.reflect.TypeToken;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.plugin.format.FileFormat;
Expand All @@ -37,6 +38,9 @@ public class FileSourceConfig extends AbstractFileSourceConfig {
public static final String NAME_FILE_SYSTEM_PROPERTIES = "fileSystemProperties";
public static final String NAME_PATH = "path";
public static final String NAME_FILE_ENCODING = "fileEncoding";
public static final String NAME_SHEET = "sheet";
public static final String NAME_SHEET_VALUE = "sheetValue";
public static final String NAME_TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow";

private static final Gson GSON = new Gson();
private static final Type MAP_STRING_STRING_TYPE = new TypeToken<Map<String, String>>() { }.getType();
Expand Down Expand Up @@ -64,6 +68,25 @@ public class FileSourceConfig extends AbstractFileSourceConfig {
@Nullable
@Description("The maximum number of rows that will get investigated for automatic data type detection.")
private Long sampleSize;

// How the sheet is selected (by name or by number); optional and macro-enabled.
@Name(NAME_SHEET)
@Macro
@Nullable
@Description("Select the sheet by name or number. Default is 'Sheet Number'.")
private String sheet;

// Name or number of the sheet to read; optional and macro-enabled.
// The first literal ends with a space so the two concatenated sentences do not run together.
@Name(NAME_SHEET_VALUE)
@Macro
@Nullable
@Description("The name/number of the sheet to read from. If not specified, the first sheet will be read. " +
  "Sheet Numbers are 0 based, ie first sheet is 0.")
private String sheetValue;

// Whether reading stops at the first empty row; optional and macro-enabled.
// Held as a String here even though the description documents a true/false setting.
@Name(NAME_TERMINATE_IF_EMPTY_ROW)
@Macro
@Nullable
@Description("Specify whether to stop reading after encountering the first empty row. Defaults to false.")
private String terminateIfEmptyRow;

FileSourceConfig() {
super();
Expand Down
Loading