From d5a85f44df7ec42fd483b1a60c31cf7e8ed40d2f Mon Sep 17 00:00:00 2001 From: Alex Dean Date: Sun, 7 Apr 2013 01:55:17 +0100 Subject: [PATCH 1/4] Changed setFilesystem to work with Amazon EMR/S3 paths as well --- src/main/java/com/m6d/filecrush/crush/Crush.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/m6d/filecrush/crush/Crush.java b/src/main/java/com/m6d/filecrush/crush/Crush.java index 96f5072..70ffd83 100644 --- a/src/main/java/com/m6d/filecrush/crush/Crush.java +++ b/src/main/java/com/m6d/filecrush/crush/Crush.java @@ -574,7 +574,7 @@ public int run(String[] args) throws Exception { return 0; } - setFileSystem(FileSystem.get(job)); + setFileSystem(srcDir.getFileSystem(job)); FileStatus status = fs.getFileStatus(srcDir); From 51737369bf83bb227ad47fb1740ebbc9042d7320 Mon Sep 17 00:00:00 2001 From: Alex Dean Date: Sun, 7 Apr 2013 01:55:42 +0100 Subject: [PATCH 2/4] Fixed --input-format and --output-format CLI options --- README | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README b/README index 862f0b0..8879e14 100644 --- a/README +++ b/README @@ -159,8 +159,8 @@ Now we try an example using the directory options. Say we invoke the crush like Crush \ --regex=.*/(.+) \ --replacement=$1-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ - --input=sequence \ - --output=sequence \ + --input-format=sequence \ + --output-format=sequence \ /user/example/work/input /user/example/work/output 20100221175612 The --regex and --replacement arguments are similar to the arguments passed to String.replaceAll(). The regex argument matches the final part of a directory path. For /user/example/work/input, it will match input. For /user/example/work/input/subdir, it will match subdir. For matching purposes, a directory path does not have a trailing slash. The replacement argument refers to the match group by number to rename the file. The result is: @@ -179,8 +179,8 @@ The following invocation fails: Crush \ --regex=.*/input \ --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ - --input=sequence \ - --output=sequence \ + --input-format=sequence \ + --output-format=sequence \ /user/example/work/input /user/example/work/output 20100221175612 Since we have specified some directory options, we must ensure that all directories in hierarchy rooted at the input argument have a matching regex (since the default regex is no longer applicable). In this invocation, there is no regex argument that matches /user/example/work/input/subdir. We must change it to: @@ -188,12 +188,12 @@ Since we have specified some directory options, we must ensure that all director Crush \ --regex=.*/input \ --replacement=input-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ - --input=sequence \ - --output=sequence \ + --input-format=sequence \ + --output-format=sequence \ --regex=.*/subdir \ --replacement=as-text-${crush.timestamp}-${crush.task.num}-${crush.file.num} \ - --input=sequence \ - --output=text \ + --input-format=sequence \ + --output-format=text \ /user/example/work/input /user/example/work/output 20100221175612 This will yield: From f638179f32674dbff7bd61ec0e9726e9ec3bcec3 Mon Sep 17 00:00:00 2001 From: Alex Dean Date: Sun, 7 Apr 2013 02:36:27 +0100 Subject: [PATCH 3/4] Version bump --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d414b55..31601c9 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.m6d filecrush M6D App - Filecrush - 2.2.2-SNAPSHOT + 2.2.3-SNAPSHOT filecrush utility jar From 78d2bd7df5ea81518da17567746f8b15ea2987e1 Mon Sep 17 00:00:00 2001 From: Alex Dean Date: Sun, 7 Apr 2013 02:42:15 +0100 Subject: [PATCH 4/4] Fixed the filesystem lookups in the other files --- src/main/java/com/m6d/filecrush/clean/Clean.java | 6 ++++-- .../java/com/m6d/filecrush/crush/CountersInputFormat.java | 2 +- src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java | 5 +++-- src/main/java/com/m6d/filecrush/crush/CrushReducer.java | 5 +++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/main/java/com/m6d/filecrush/clean/Clean.java b/src/main/java/com/m6d/filecrush/clean/Clean.java index d68a518..234dfd9 100644 --- a/src/main/java/com/m6d/filecrush/clean/Clean.java +++ b/src/main/java/com/m6d/filecrush/clean/Clean.java @@ -51,8 +51,10 @@ public static void main(String[] args) throws Exception { public int run(String[] args) throws Exception { conf = getConf(); + Path targetDir = new Path(conf.get(TARGET_DIR)); + try { - fs=FileSystem.get(getConf()); + fs = targetDir.getFileSystem(conf); } catch (IOException e) { throw new RuntimeException("Could not open filesystem"); } @@ -67,7 +69,7 @@ public int run(String[] args) throws Exception { cutoff=now-targetAge; } - return cleanup (new Path(conf.get(TARGET_DIR))); + return cleanup(targetDir); } diff --git a/src/main/java/com/m6d/filecrush/crush/CountersInputFormat.java b/src/main/java/com/m6d/filecrush/crush/CountersInputFormat.java index 8ca5512..7097be7 100644 --- a/src/main/java/com/m6d/filecrush/crush/CountersInputFormat.java +++ b/src/main/java/com/m6d/filecrush/crush/CountersInputFormat.java @@ -50,7 +50,7 @@ public RecordReader getRecordReader(InputSplit inputSpli Path path = fSplit.getPath(); long length = fSplit.getLength(); - FileSystem fs = FileSystem.get(jobconf); + FileSystem fs = path.getFileSystem(jobconf); FSDataInputStream is = fs.open(path); diff --git a/src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java b/src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java index a65573f..57aa595 100644 --- a/src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java +++ b/src/main/java/com/m6d/filecrush/crush/CrushPartitioner.java @@ -41,9 +41,10 @@ public void configure(JobConf job) { bucketToPartition = new HashMap(100); try { - FileSystem fs = FileSystem.get(job); + Path p = new Path(path); + FileSystem fs = p.getFileSystem(job); - Reader reader = new Reader(fs, new Path(path), job); + Reader reader = new Reader(fs, p, job); Text bucket = new Text(); IntWritable partNum = new IntWritable(); diff --git a/src/main/java/com/m6d/filecrush/crush/CrushReducer.java b/src/main/java/com/m6d/filecrush/crush/CrushReducer.java index 22b2f76..f9cbd97 100644 --- a/src/main/java/com/m6d/filecrush/crush/CrushReducer.java +++ b/src/main/java/com/m6d/filecrush/crush/CrushReducer.java @@ -127,7 +127,8 @@ public void configure(JobConf job) { * The files we write should be rooted in the "crush" subdir of the output directory to distinguish them from the files * created by the collector. */ - outDirPath = new Path(outDirPath + "/crush").toUri().getPath(); + Path outDirP = new Path(outDirPath + "/crush"); + outDirPath = outDirP.toUri().getPath(); /* * Configure the regular expressions and replacements we use to convert dir names to crush output file names. Also get the @@ -145,7 +146,7 @@ public void configure(JobConf job) { placeHolderToValue.put("crush.timestamp", job.get("crush.timestamp")); try { - fs = FileSystem.get(job); + fs = outDirP.getFileSystem(job); } catch (RuntimeException e) { throw e; } catch (Exception e) {