-
Notifications
You must be signed in to change notification settings - Fork 69
Path exclusion/inclusion filtering #105
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,13 +23,14 @@ | |
| import org.apache.hadoop.fs.FileStatus; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hadoop.mapred.JobHistory; | ||
|
|
||
| /** | ||
| * Pathfilter that allows only files that are named correctly and are modified | ||
| * within a certain time range. | ||
| * | ||
| */ | ||
| public class JobFileModifiedRangePathFilter extends JobFilePathFilter { | ||
| public class JobFileModifiedRangeSubstringPathFilter extends JobFilePathFilter { | ||
|
|
||
| /** | ||
| * The minimum modification time of a file to be accepted in milliseconds | ||
|
|
@@ -47,7 +48,12 @@ public class JobFileModifiedRangePathFilter extends JobFilePathFilter { | |
| * The configuration of this processing job (not the files we are processing). | ||
| */ | ||
| private final Configuration myConf; | ||
| private static Log LOG = LogFactory.getLog(JobFileModifiedRangePathFilter.class); | ||
|
|
||
| private String[] pathExclusionFilter; | ||
|
|
||
| private String[] pathInclusionFilter; | ||
|
|
||
| private static Log LOG = LogFactory.getLog(JobFileModifiedRangeSubstringPathFilter.class); | ||
|
|
||
| /** | ||
| * Constructs a filter that accepts only JobFiles with lastModification time | ||
|
|
@@ -62,11 +68,9 @@ public class JobFileModifiedRangePathFilter extends JobFilePathFilter { | |
| * maximum modification time of a file to be accepted in milliseconds | ||
| * since January 1, 1970 UTC (including). | ||
| */ | ||
| public JobFileModifiedRangePathFilter(Configuration myConf, | ||
| public JobFileModifiedRangeSubstringPathFilter(Configuration myConf, | ||
| long minModificationTimeMillis, long maxModificationTimeMillis) { | ||
| this.myConf = myConf; | ||
| this.minModificationTimeMillis = minModificationTimeMillis; | ||
| this.maxModificationTimeMillis = maxModificationTimeMillis; | ||
| this(myConf, minModificationTimeMillis, maxModificationTimeMillis, null, null); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -79,11 +83,35 @@ public JobFileModifiedRangePathFilter(Configuration myConf, | |
| * The minimum modification time of a file to be accepted in | ||
| * milliseconds since January 1, 1970 UTC (excluding). | ||
| */ | ||
| public JobFileModifiedRangePathFilter(Configuration myConf, | ||
| public JobFileModifiedRangeSubstringPathFilter(Configuration myConf, | ||
| long minModificationTimeMillis) { | ||
| this(myConf, minModificationTimeMillis, Long.MAX_VALUE); | ||
| } | ||
|
|
||
| /** | ||
| * Constructs a filter that accepts only JobFiles with lastModification time | ||
| * as least the specified minumum. Also accepts a simple substring to exclude. | ||
| * | ||
| * @param myConf | ||
| * used to be able to go from a path to a FileStatus. | ||
| * @param minModificationTimeMillis | ||
| * The minimum modification time of a file to be accepted in | ||
| * milliseconds since January 1, 1970 UTC (excluding). | ||
| * @param maxModificationTimeMillis The | ||
| * maximum modification time of a file to be accepted in milliseconds | ||
| * since January 1, 1970 UTC (including). | ||
| * @param pathExclusionFilter | ||
| * files with this substring path will be excluded | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are missing javadoc for pathInclusionFilter argument. |
||
| */ | ||
| public JobFileModifiedRangeSubstringPathFilter(Configuration myConf, long minModificationTimeMillis, long maxModificationTimeMillis, | ||
| String[] pathExclusionFilter, String[] pathInclusionFilter) { | ||
| this.myConf = myConf; | ||
| this.minModificationTimeMillis = minModificationTimeMillis; | ||
| this.maxModificationTimeMillis = maxModificationTimeMillis; | ||
| this.pathExclusionFilter = pathExclusionFilter; | ||
| this.pathInclusionFilter = pathInclusionFilter; | ||
| } | ||
|
|
||
| /* | ||
| * (non-Javadoc) | ||
| * | ||
|
|
@@ -96,25 +124,53 @@ public boolean accept(Path path) { | |
| if (!super.accept(path)) { | ||
| return false; | ||
| } | ||
|
|
||
| JobFile jobFile = new JobFile(path.getName()); | ||
| if (jobFile.isJobConfFile() || jobFile.isJobHistoryFile()) { | ||
| if (jobFile.isJobHistoryFile()) { | ||
| if (!includesPathSubstrings(path) || !excludesPathSubstrings(path)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The double negative with an or is really hard to read here, then the methods have indexOf != with false and true return statements. |
||
| return false; | ||
| } | ||
| } | ||
| try { | ||
| FileSystem fs = path.getFileSystem(myConf); | ||
| FileStatus fileStatus = fs.getFileStatus(path); | ||
| long fileModificationTimeMillis = fileStatus.getModificationTime(); | ||
| return accept(fileModificationTimeMillis); | ||
| } catch (IOException e) { | ||
| throw new ImportException("Cannot determine file modification time of " | ||
| + path.getName(), e); | ||
| throw new ImportException("Cannot determine file modification time of " + path.getName(), e); | ||
| } | ||
| } else { | ||
| // Reject anything that does not match a job conf filename. | ||
| LOG.info(" Not a valid job conf / job history file "+ path.getName()); | ||
| LOG.info(" Not a valid job conf / job history file " + path.getName()); | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| private boolean excludesPathSubstrings(Path path) { | ||
| if (pathExclusionFilter == null) | ||
| return true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need curly braces around the then block for ifs. |
||
|
|
||
| for (String s: pathExclusionFilter) { | ||
| if (path.toString().indexOf(s) != -1) | ||
| return false; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| private boolean includesPathSubstrings(Path path) { | ||
| if (pathInclusionFilter == null) | ||
| return true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. curly braces |
||
| boolean matches = false; | ||
| for (String s: pathInclusionFilter) { | ||
| if (path.toString().indexOf(s) != -1) | ||
| matches = true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. curlies |
||
| } | ||
|
|
||
| return matches; | ||
| } | ||
|
|
||
| /** | ||
| * @param fileModificationTimeMillis | ||
| * in milliseconds since January 1, 1970 UTC | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these looks like some specific internal users you run in your environment.
We may have to leave this out of the generic setup and you may have to add that in your internal deployment yourself.
Alternatively we can see if this can be picked up from hrave-conf.xml so that the shell script and the config are separated.