pom.xml

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -7,17 +7,6 @@
  
    	<version>1.0.0</version>

    	<packaging>jar</packaging>

    	<distributionManagement>

    		<snapshotRepository>

    			<id>ossrh</id>

    			<url>https://oss.sonatype.org/content/repositories/snapshots</url>

    		</snapshotRepository>

    		<repository>

    			<id>ossrh</id>

    			<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>

    		</repository>

    	</distributionManagement>

    	<name>${project.groupId}:${project.artifactId}</name>

    	<description>An application designed to be used as a command line utility for compacting files using the Spark framework.</description>

    	<url>http://maven.apache.org</url>

    @@ -122,6 +111,9 @@
  
    			<plugin>

    				<groupId>org.apache.maven.plugins</groupId>

    				<artifactId>maven-gpg-plugin</artifactId>

    				<configuration>

    					<skip>true</skip>

    				</configuration>

    				<version>1.5</version>

    				<executions>

    					<execution>

    @@ -140,17 +132,17 @@
  
    		<dependency>

    			<groupId>org.apache.spark</groupId>

    			<artifactId>spark-core_2.10</artifactId>

    			<version>1.5.2</version>

    			<version>2.2.0</version>

    		</dependency>

    		<dependency>

    			<groupId>org.apache.spark</groupId>

    			<artifactId>spark-sql_2.10</artifactId>

    			<version>1.5.2</version>

    			<version>2.2.0</version>

    		</dependency>

    		<dependency>

    			<groupId>org.apache.hadoop</groupId>

    			<artifactId>hadoop-hdfs</artifactId>

    			<version>2.6.4</version>

    			<version>2.7.2</version>

    		</dependency>

    		<dependency>

    			<groupId>commons-cli</groupId>

    @@ -172,4 +164,4 @@
  
    		</repository>

    	</repositories>

    </project>

    </project>

src/main/java/com/github/KeithSSmith/spark_compaction/Compact.java

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -17,7 +17,7 @@
  
    import org.apache.spark.SparkConf;

    import org.apache.spark.api.java.JavaRDD;

    import org.apache.spark.api.java.JavaSparkContext;

    import org.apache.spark.sql.DataFrame;

    import org.apache.spark.sql.Dataset;

    import org.apache.spark.sql.SQLContext;

    public class Compact {

    @@ -180,13 +180,13 @@ public void compact(String inputPath, String outputPath) throws IOException {
  
                textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);

            } else if (this.outputSerialization.equals(PARQUET)) {

                SQLContext sqlContext = new SQLContext(sc);

                DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));

                Dataset parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));

                parquetFile.coalesce(this.splitSize).write().parquet(outputPath);

            } else if (this.outputSerialization.equals(AVRO)) {

                // For this to work the files must end in .avro

            	// Another issue is that when using compression the compression codec extension is not being added to the file name.

                SQLContext sqlContext = new SQLContext(sc);

                DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));

                Dataset avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));

                avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);

            } else {

                System.out.println("Did not match any serialization type: text, parquet, or avro.  Recieved: " +

    @@ -207,12 +207,12 @@ public void compact(String[] args) throws IOException {
  
                textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);

            } else if (this.outputSerialization.equals(PARQUET)) {

                SQLContext sqlContext = new SQLContext(sc);

                DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));

                Dataset parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));

                parquetFile.coalesce(this.splitSize).write().parquet(outputPath);

            } else if (this.outputSerialization.equals(AVRO)) {

                // For this to work the files must end in .avro

                SQLContext sqlContext = new SQLContext(sc);

                DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));

                Dataset avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));

                avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);

            } else {

                System.out.println("Did not match any serialization type: text, parquet, or avro.  Recieved: " +

Compatibility changes for Spark 2.2 #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

abmeena wants to merge 1 commit into KeithSSmith:master from abmeena:master

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Compatibility changes for Spark 2.2 #3

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Compatibility changes for Spark 2.2 #3

Are you sure you want to change the base?

Uh oh!

Compatibility changes for Spark 2.2 #3

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing