diff --git a/.gitignore b/.gitignore index 76efab50ebb3..6aab507c580a 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ dependency-reduced-pom.xml apache-hive-3.1.3-bin-gdp-*.tar.gz hive_build_dist/ .devcontainer/.devpod-internal/ + +.qodo diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index d055288e7369..f57d9f058fbd 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -637,6 +637,9 @@ public static enum ConfVars { // Metastore stuff. Be sure to update HiveConf.metaVars when you add something here! METASTOREDBTYPE("hive.metastore.db.type", "DERBY", new StringSet("DERBY", "ORACLE", "MYSQL", "MSSQL", "POSTGRES"), "Type of database used by the metastore. Information schema & JDBCStorageHandler depend on it."), + METASTORE_CLIENT_FACTORY_CLASS("hive.metastore.client.factory.class", + "org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClientFactory", + "The name of the factory class that produces objects implementing the IMetaStoreClient interface."), /** * @deprecated Use MetastoreConf.WAREHOUSE */ @@ -4420,7 +4423,13 @@ public static enum ConfVars { "This parameter enables a number of optimizations when running on blobstores:\n" + "(1) If hive.blobstore.use.blobstore.as.scratchdir is false, force the last Hive job to write to the blobstore.\n" + "This is a performance optimization that forces the final FileSinkOperator to write to the blobstore.\n" + - "See HIVE-15121 for details."); + "See HIVE-15121 for details."), + + HIVE_BLOBSTORE_USE_OUTPUTCOMMITTER("hive.blobstore.use.output-committer", false, "Whether to " + + "use a custom PathOutputCommitter to commit data. For all the URIs specified in " + + "hive.blobstore.supported.schemes, Hive will honor the config " + + "mapreduce.outputcommitter.factory.scheme.[uri-scheme]. 
This overrides the behavior " + + "described in hive.blobstore.optimizations.enabled. See HIVE-16295 for details."); public final String varname; public final String altName; diff --git a/common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java b/common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java index 764a832e2811..addfd58ef39f 100644 --- a/common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java +++ b/common/src/java/org/apache/hadoop/hive/ql/log/PerfLogger.java @@ -72,6 +72,7 @@ public class PerfLogger { public static final String LOAD_HASHTABLE = "LoadHashtable"; public static final String TEZ_GET_SESSION = "TezGetSession"; public static final String SAVE_TO_RESULTS_CACHE = "saveToResultsCache"; + public static final String FILE_MOVES = "FileMoves"; public static final String SPARK_SUBMIT_TO_RUNNING = "SparkSubmitToRunning"; public static final String SPARK_BUILD_PLAN = "SparkBuildPlan"; diff --git a/hcatalog/core/pom.xml b/hcatalog/core/pom.xml index fe4e152d362e..7433cadaef7c 100644 --- a/hcatalog/core/pom.xml +++ b/hcatalog/core/pom.xml @@ -100,6 +100,21 @@ hadoop-annotations ${hadoop.version} + + + com.amazonaws + aws-java-sdk-core + 1.12.472 + provided + + + software.amazon.awssdk + bundle + 2.20.109 + compile + org.apache.hadoop hadoop-archives @@ -256,6 +271,12 @@ + + org.apache.tez + tez-mapreduce + 0.9.1 + compile + diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HCatUtil.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HCatUtil.java index 8e72a1275a5c..c288a032b8e3 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HCatUtil.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HCatUtil.java @@ -41,6 +41,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; @@ -757,4 +758,13 @@ public static void assertNotNull(Object t, String msg, Logger logger) { throw new IllegalArgumentException(msg); } } + + public static boolean isS3(Path path) { + if (path.toString().startsWith("s3")) { + return true; + } + else { + return false; + } + } } diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HiveClientCache.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HiveClientCache.java index e18dae983b56..3babb48de96d 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HiveClientCache.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/common/HiveClientCache.java @@ -18,8 +18,15 @@ */ package org.apache.hive.hcatalog.common; +import java.io.Closeable; import java.io.IOException; +import java.lang.reflect.InvocationHandler; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Proxy; +import java.util.Arrays; import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; @@ -30,6 +37,7 @@ import javax.security.auth.login.LoginException; +import com.google.common.collect.ImmutableSet; import org.apache.commons.lang.builder.EqualsBuilder; import org.apache.commons.lang.builder.HashCodeBuilder; import org.apache.hadoop.hive.common.classification.InterfaceAudience; @@ -39,6 +47,7 @@ import org.apache.hadoop.hive.metastore.RetryingMetaStoreClient; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.annotation.NoReconnect; +import org.apache.hadoop.hive.ql.metadata.HiveUtils; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.hive.shims.Utils; import 
org.apache.hadoop.security.UserGroupInformation; @@ -90,7 +99,8 @@ private int getThreadId() { } public static IMetaStoreClient getNonCachedHiveMetastoreClient(HiveConf hiveConf) throws MetaException { - return RetryingMetaStoreClient.getProxy(hiveConf, true); + //return RetryingMetaStoreClient.getProxy(hiveConf, true); + return HiveUtils.createMetaStoreClient(hiveConf, true, new ConcurrentHashMap()); } public HiveClientCache(HiveConf hiveConf) { @@ -275,7 +285,24 @@ public IMetaStoreClient get(final HiveConf hiveConf) throws MetaException, IOExc cacheableHiveMetaStoreClient.acquire(); } } - return cacheableHiveMetaStoreClient; + return (IMetaStoreClient)cacheableHiveMetaStoreClient; + } + + private static Class[] getAllInterfaces(Class... classes) { + ImmutableSet.Builder> builder = ImmutableSet.builder(); + Class[] classArray = classes; + int length = classes.length; + + for(int i = 0; i < length; ++i) { + Class element = classArray[i]; + if (element.isInterface()) { + builder.add(element); + } else { + builder.addAll(Arrays.asList(element.getInterfaces())); + } + } + + return (Class[])builder.build().toArray(new Class[0]); } /** @@ -289,17 +316,12 @@ public IMetaStoreClient get(final HiveConf hiveConf) throws MetaException, IOExc private ICacheableMetaStoreClient getOrCreate(final HiveClientCacheKey cacheKey) throws IOException, MetaException, LoginException { try { - return hiveCache.get(cacheKey, new Callable() { - @Override - public ICacheableMetaStoreClient call() throws MetaException { - // This is called from HCat, so always allow embedded metastore (as was the default). 
- return - (ICacheableMetaStoreClient) RetryingMetaStoreClient.getProxy(cacheKey.getHiveConf(), - new Class[]{HiveConf.class, Integer.class, Boolean.class}, - new Object[]{cacheKey.getHiveConf(), timeout, true}, - CacheableHiveMetaStoreClient.class.getName()); - } - }); + return (ICacheableMetaStoreClient)this.hiveCache.get(cacheKey, new Callable() { + public ICacheableMetaStoreClient call() throws MetaException { + IMetaStoreClient metaStoreClient = HiveClientCache.getNonCachedHiveMetastoreClient(cacheKey.getHiveConf()); + return (ICacheableMetaStoreClient) Proxy.newProxyInstance(HiveClientCache.class.getClassLoader(), HiveClientCache.getAllInterfaces(IMetaStoreClient.class, CacheableHiveMetaStoreClient.class), new CacheableHiveMetaStoreClient(metaStoreClient)); + } + }); } catch (ExecutionException e) { Throwable t = e.getCause(); if (t instanceof IOException) { @@ -368,7 +390,7 @@ public String toString() { } @InterfaceAudience.Private - public interface ICacheableMetaStoreClient extends IMetaStoreClient { + public interface ICacheableMetaStoreClient extends Closeable { @NoReconnect void acquire(); @@ -398,15 +420,18 @@ public interface ICacheableMetaStoreClient extends IMetaStoreClient { /** * Add # of current users on HiveMetaStoreClient, so that the client can be cleaned when no one is using it. 
*/ - static class CacheableHiveMetaStoreClient extends HiveMetaStoreClient implements ICacheableMetaStoreClient { + static class CacheableHiveMetaStoreClient implements InvocationHandler, ICacheableMetaStoreClient { private final AtomicInteger users = new AtomicInteger(0); + + private static final ImmutableSet CLOSE_METHODS; + private static final String CLOSE_METHOD_NAME = "close"; private volatile boolean expiredFromCache = false; + private final IMetaStoreClient base; private boolean isClosed = false; - CacheableHiveMetaStoreClient(final HiveConf conf, final Integer timeout, Boolean allowEmbedded) - throws MetaException { - super(conf, null, allowEmbedded); + CacheableHiveMetaStoreClient(IMetaStoreClient base) { + this.base = base; } /** @@ -468,7 +493,7 @@ public AtomicInteger getUsers() { public boolean isOpen() { try { // Look for an unlikely database name and see if either MetaException or TException is thrown - super.getDatabases("NonExistentDatabaseUsedForHealthCheck"); + this.base.getDatabases("NonExistentDatabaseUsedForHealthCheck"); } catch (TException e) { return false; } @@ -507,7 +532,7 @@ public synchronized void tearDownIfUnused() { public void tearDown() { try { if (!isClosed) { - super.close(); + this.base.close(); } isClosed = true; } catch (Exception e) { @@ -538,5 +563,28 @@ protected void finalize() throws Throwable { super.finalize(); } } + + public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { + try { + if (method.getDeclaringClass().isAssignableFrom(this.getClass())) { + return method.invoke(this, args); + } else if (CLOSE_METHODS.contains(method)) { + this.close(); + return null; + } else { + return method.invoke(this.base, args); + } + } catch (InvocationTargetException var5) { + throw var5.getCause(); + } + } + + static { + try { + CLOSE_METHODS = ImmutableSet.of(AutoCloseable.class.getMethod("close"), Closeable.class.getMethod("close"), IMetaStoreClient.class.getMethod("close"), 
ICacheableMetaStoreClient.class.getMethod("close")); + } catch (NoSuchMethodException e) { + throw new RuntimeException(e); + } + } } } diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/DynamicPartitionFileRecordWriterContainer.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/DynamicPartitionFileRecordWriterContainer.java index cda8770a2c98..08fa26849c69 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/DynamicPartitionFileRecordWriterContainer.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/DynamicPartitionFileRecordWriterContainer.java @@ -43,6 +43,7 @@ import org.apache.hive.hcatalog.common.ErrorType; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.HCatRecord; +import org.apache.hive.hcatalog.mapreduce.s3.commit.magic.MagicS3GuardCommitter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -157,6 +158,11 @@ protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOExceptio HCatMapRedUtil.createTaskAttemptContext(context); configureDynamicStorageHandler(currTaskContext, dynamicPartValues); localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration()); + boolean isMagic = false; + + if (localJobInfo.getLocation().startsWith("s3")) { + isMagic = true; + } // Setup serDe. AbstractSerDe currSerDe = @@ -182,9 +188,11 @@ protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOExceptio // but may become an issue for cases when the method is used to perform // other setup tasks. + Path outputLoc = new Path(localJobInfo.getLocation()); + // Get Output Committer - org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = - currTaskContext.getJobConf().getOutputCommitter(); + org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = isMagic ? 
+ new MagicS3GuardCommitter(outputLoc, currTaskContext) : currTaskContext.getJobConf().getOutputCommitter(); // Create currJobContext the latest so it gets all the config changes org.apache.hadoop.mapred.JobContext currJobContext = @@ -199,10 +207,17 @@ protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOExceptio currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible()); // Set temp location. - currTaskContext.getConfiguration().set( - "mapred.work.output.dir", - new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext) - .getWorkPath().toString()); + if (isMagic) { + currTaskContext.getConfiguration().set( + "mapred.work.output.dir", + outputLoc.toString()); + } + else { + currTaskContext.getConfiguration().set( + "mapred.work.output.dir", + new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext) + .getWorkPath().toString()); + } // Set up task. baseOutputCommitter.setupTask(currTaskContext); diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FileOutputFormatContainer.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FileOutputFormatContainer.java index d503550d5dae..bd7fe77546fd 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FileOutputFormatContainer.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FileOutputFormatContainer.java @@ -46,6 +46,7 @@ import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.common.HCatUtil; import org.apache.hive.hcatalog.data.HCatRecord; +import org.apache.hive.hcatalog.mapreduce.s3.commit.magic.MagicS3GuardCommitter; import org.apache.thrift.TException; import java.io.IOException; @@ -142,10 +143,18 @@ public void checkOutputSpecs(JobContext context) throws IOException, Interrupted public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { //this needs to be manually set, under 
normal circumstances MR Task does this setWorkOutputPath(context); - return new FileOutputCommitterContainer(context, - HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed() ? - null : - new JobConf(context.getConfiguration()).getOutputCommitter()); + if (true/*HCatUtil.isS3A(context.getConfiguration())*/) { + return new S3OutputCommitterContainer(context, + HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed() ? + null : + new MagicS3GuardCommitter(new Path(context.getConfiguration().get("mapred.output.dir")), context)); + } + else { + return new FileOutputCommitterContainer(context, + HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed() ? + null : + new JobConf(context.getConfiguration()).getOutputCommitter()); + } } /** @@ -246,8 +255,15 @@ static void setWorkOutputPath(TaskAttemptContext context) throws IOException { String outputPath = context.getConfiguration().get("mapred.output.dir"); //we need to do this to get the task path and set it for mapred implementation //since it can't be done automatically because of mapreduce->mapred abstraction - if (outputPath != null) - context.getConfiguration().set("mapred.work.output.dir", - new FileOutputCommitter(new Path(outputPath), context).getWorkPath().toString()); + if (outputPath != null) { + if (HCatUtil.isS3(new Path(outputPath))) { + context.getConfiguration().set("mapred.work.output.dir", + new MagicS3GuardCommitter(new Path(outputPath), context).getWorkPath().toString()); + } + else { + context.getConfiguration().set("mapred.work.output.dir", + new FileOutputCommitter(new Path(outputPath), context).getWorkPath().toString()); + } + } } } diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FosterStorageHandler.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FosterStorageHandler.java index 195eaa367933..ae9e6f4f8418 100644 --- 
a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FosterStorageHandler.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/FosterStorageHandler.java @@ -151,6 +151,10 @@ public void configureOutputJobProperties(TableDesc tableDesc, HCatUtil.deserialize(tableDesc.getJobProperties().get( HCatConstants.HCAT_KEY_OUTPUT_INFO)); String parentPath = jobInfo.getTableInfo().getTableLocation(); + boolean isMagic = false; + if (parentPath.startsWith("s3")) { + isMagic = true; + } String dynHash = tableDesc.getJobProperties().get( HCatConstants.HCAT_DYNAMIC_PTN_JOBID); String idHash = tableDesc.getJobProperties().get( @@ -165,9 +169,13 @@ public void configureOutputJobProperties(TableDesc tableDesc, && jobInfo.getCustomDynamicRoot().length() > 0) { parentPath = new Path(parentPath, jobInfo.getCustomDynamicRoot()).toString(); } - parentPath = new Path(parentPath, FileOutputCommitterContainer.DYNTEMP_DIR_NAME + dynHash).toString(); + if (!isMagic) { + parentPath = new Path(parentPath, FileOutputCommitterContainer.DYNTEMP_DIR_NAME + dynHash).toString(); + } } else { - parentPath = new Path(parentPath,FileOutputCommitterContainer.SCRATCH_DIR_NAME + idHash).toString(); + if (!isMagic) { + parentPath = new Path(parentPath,FileOutputCommitterContainer.SCRATCH_DIR_NAME + idHash).toString(); + } } String outputLocation; @@ -183,10 +191,20 @@ public void configureOutputJobProperties(TableDesc tableDesc, && Boolean.parseBoolean((String)tableDesc.getProperties().get("EXTERNAL")) && jobInfo.getLocation() != null && jobInfo.getLocation().length() > 0) { // honor custom location for external table apart from what metadata specifies - outputLocation = jobInfo.getLocation(); + if (!isMagic) { + outputLocation = jobInfo.getLocation(); + } + else { + outputLocation = jobInfo.getLocation() + "/__magic"; + } } else if (dynHash == null && jobInfo.getPartitionValues().size() == 0) { // Unpartitioned table, writing to the scratch dir directly is good enough. 
- outputLocation = ""; + if (!isMagic) { + outputLocation = ""; + } + else { + outputLocation = "__magic"; + } } else { List cols = new ArrayList(); List values = new ArrayList(); diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/S3OutputCommitterContainer.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/S3OutputCommitterContainer.java new file mode 100644 index 000000000000..3d523e85e54a --- /dev/null +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/S3OutputCommitterContainer.java @@ -0,0 +1,826 @@ +package org.apache.hive.hcatalog.mapreduce; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.common.StatsSetupConst; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.*; +import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.JobStatus; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hive.hcatalog.common.ErrorType; +import org.apache.hive.hcatalog.common.HCatConstants; +import org.apache.hive.hcatalog.common.HCatException; +import org.apache.hive.hcatalog.common.HCatUtil; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils; +import 
org.apache.hive.hcatalog.har.HarOutputCommitterPostProcessor; +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.util.*; + +/** + * Part of the *Output*Container classes + * See {@link FileOutputFormatContainer} for more information + */ +class S3OutputCommitterContainer extends OutputCommitterContainer { + + private static final String TEMP_DIR_NAME = "_temporary"; + private static final String LOGS_DIR_NAME = "_logs"; + + static final String SCRATCH_DIR_NAME = "_SCRATCH"; + private static final String APPEND_SUFFIX = "_a_"; + private static final int APPEND_COUNTER_WARN_THRESHOLD = 1000; + private final int maxAppendAttempts; + + private static final Logger LOG = LoggerFactory.getLogger(S3OutputCommitterContainer.class); + private final boolean dynamicPartitioningUsed; + private boolean partitionsDiscovered; + private final boolean customDynamicLocationUsed; + + private Map> partitionsDiscoveredByPath; + private Map contextDiscoveredByPath; + private final HiveStorageHandler cachedStorageHandler; + + HarOutputCommitterPostProcessor harProcessor = new HarOutputCommitterPostProcessor(); + + private String ptnRootLocation = null; + + private OutputJobInfo jobInfo = null; + + /** + * @param context current JobContext + * @param baseCommitter OutputCommitter to contain + * @throws IOException + */ + public S3OutputCommitterContainer(JobContext context, + org.apache.hadoop.mapred.OutputCommitter baseCommitter) throws IOException { + super(context, baseCommitter); + jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration()); + dynamicPartitioningUsed = jobInfo.isDynamicPartitioningUsed(); + + this.partitionsDiscovered = !dynamicPartitioningUsed; + cachedStorageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), jobInfo.getTableInfo().getStorerInfo()); + Table table = new Table(jobInfo.getTableInfo().getTable()); + if 
(dynamicPartitioningUsed && Boolean.parseBoolean((String)table.getProperty("EXTERNAL")) + && jobInfo.getCustomDynamicPath() != null + && jobInfo.getCustomDynamicPath().length() > 0) { + customDynamicLocationUsed = true; + } else { + customDynamicLocationUsed = false; + } + + this.maxAppendAttempts = context.getConfiguration().getInt(HCatConstants.HCAT_APPEND_LIMIT, APPEND_COUNTER_WARN_THRESHOLD); + } + + @Override + public void abortTask(TaskAttemptContext context) throws IOException { + if (!dynamicPartitioningUsed) { + FileOutputFormatContainer.setWorkOutputPath(context); + getBaseOutputCommitter().abortTask(HCatMapRedUtil.createTaskAttemptContext(context)); + } else { + try { + TaskCommitContextRegistry.getInstance().abortTask(context); + } + finally { + TaskCommitContextRegistry.getInstance().discardCleanupFor(context); + } + } + } + + @Override + public void commitTask(TaskAttemptContext context) throws IOException { + if (!dynamicPartitioningUsed) { + //See HCATALOG-499 + FileOutputFormatContainer.setWorkOutputPath(context); + getBaseOutputCommitter().commitTask(HCatMapRedUtil.createTaskAttemptContext(context)); + } else { + try { + TaskCommitContextRegistry.getInstance().commitTask(context); + } + finally { + TaskCommitContextRegistry.getInstance().discardCleanupFor(context); + } + } + } + + @Override + public boolean needsTaskCommit(TaskAttemptContext context) throws IOException { + if (!dynamicPartitioningUsed) { + FileOutputFormatContainer.setWorkOutputPath(context); + return getBaseOutputCommitter().needsTaskCommit(HCatMapRedUtil.createTaskAttemptContext(context)); + } else { + // called explicitly through FileRecordWriterContainer.close() if dynamic - return false by default + return true; + } + } + + @Override + public void setupJob(JobContext context) throws IOException { + if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) { + getBaseOutputCommitter().setupJob(HCatMapRedUtil.createJobContext(context)); + } + // in dynamic usecase, 
called through FileRecordWriterContainer + } + + @Override + public void setupTask(TaskAttemptContext context) throws IOException { + if (!dynamicPartitioningUsed) { + getBaseOutputCommitter().setupTask(HCatMapRedUtil.createTaskAttemptContext(context)); + } + } + + @Override + public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException { + try { + if (dynamicPartitioningUsed) { + discoverPartitions(jobContext); + } + org.apache.hadoop.mapred.JobContext mapRedJobContext = HCatMapRedUtil + .createJobContext(jobContext); + if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) { + getBaseOutputCommitter().abortJob(mapRedJobContext, state); + } else if (dynamicPartitioningUsed) { + for (JobContext currContext : contextDiscoveredByPath.values()) { + try { + new JobConf(currContext.getConfiguration()) + .getOutputCommitter().abortJob(currContext, + state); + } catch (Exception e) { + throw new IOException(e); + } + } + } + Path src; + OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration()); + Path tblPath = new Path(jobInfo.getTableInfo().getTableLocation()); + if (dynamicPartitioningUsed) { + if (!customDynamicLocationUsed) { + src = new Path(getPartitionRootLocation(jobInfo.getLocation(), jobInfo.getTableInfo().getTable() + .getPartitionKeysSize())); + } else { + src = new Path(getCustomPartitionRootLocation(jobInfo, jobContext.getConfiguration())); + } + } else { + src = new Path(jobInfo.getLocation()); + } + FileSystem fs = src.getFileSystem(jobContext.getConfiguration()); + // Note fs.delete will fail on Windows. The reason is in OutputCommitter, + // Hadoop is still writing to _logs/history. On Linux, OS don't care file is still + // open and remove the directory anyway, but on Windows, OS refuse to remove a + // directory containing open files. So on Windows, we will leave output directory + // behind when job fail. User needs to remove the output directory manually + LOG.info("Job failed. 
Try cleaning up temporary directory [{}].", src); + if (!src.equals(tblPath)){ + fs.delete(src, true); + } + } finally { + cancelDelegationTokens(jobContext); + } + } + + public static final String SUCCEEDED_FILE_NAME = "_SUCCESS"; + static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER = + "mapreduce.fileoutputcommitter.marksuccessfuljobs"; + + private static boolean getOutputDirMarking(Configuration conf) { + return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, + false); + } + + @Override + public void commitJob(JobContext jobContext) throws IOException { + if (dynamicPartitioningUsed) { + discoverPartitions(jobContext); + // Commit each partition so it gets moved out of the job work + // dir + for (JobContext context : contextDiscoveredByPath.values()) { + new JobConf(context.getConfiguration()) + .getOutputCommitter().commitJob(context); + } + } + if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) { + getBaseOutputCommitter().commitJob( + HCatMapRedUtil.createJobContext(jobContext)); + } + registerPartitions(jobContext); + // create _SUCCESS FILE if so requested. + OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration()); + if (getOutputDirMarking(jobContext.getConfiguration())) { + Path outputPath = new Path(jobInfo.getLocation()); + FileSystem fileSys = outputPath.getFileSystem(jobContext + .getConfiguration()); + // create a file in the folder to mark it + if (fileSys.exists(outputPath)) { + Path filePath = new Path(outputPath, + SUCCEEDED_FILE_NAME); + if (!fileSys.exists(filePath)) { // may have been + // created by + // baseCommitter.commitJob() + fileSys.create(filePath).close(); + } + } + } + + // Commit has succeeded (since no exceptions have been thrown.) + // Safe to cancel delegation tokens now. 
+ cancelDelegationTokens(jobContext); + } + + @Override + public void cleanupJob(JobContext context) throws IOException { + throw new IOException("The method cleanupJob is deprecated and should not be called."); + } + + private String getCustomPartitionRootLocation(OutputJobInfo jobInfo, Configuration conf) { + if (ptnRootLocation == null) { + // we only need to calculate it once, it'll be the same for other partitions in this job. + String parentPath = jobInfo.getTableInfo().getTableLocation(); + if (jobInfo.getCustomDynamicRoot() != null + && jobInfo.getCustomDynamicRoot().length() > 0) { + parentPath = new Path(parentPath, jobInfo.getCustomDynamicRoot()).toString(); + } + Path ptnRoot; + ptnRoot = new Path(parentPath, "__magic/" + + conf.get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID)); + ptnRootLocation = ptnRoot.toString(); + } + //Check if scratch directory has relevant files that we also need to push to __magic + return ptnRootLocation; + } + + private String getPartitionRootLocation(String ptnLocn, int numPtnKeys) { + if (customDynamicLocationUsed) { + return null; + } + + if (ptnRootLocation == null) { + // we only need to calculate it once, it'll be the same for other partitions in this job. + Path ptnRoot = new Path(ptnLocn); + for (int i = 0; i < numPtnKeys; i++) { +// LOG.info("Getting parent of "+ptnRoot.getName()); + ptnRoot = ptnRoot.getParent(); + } + ptnRootLocation = ptnRoot.toString(); + } +// LOG.info("Returning final parent : "+ptnRootLocation); + return ptnRootLocation; + } + + /** + * Generate partition metadata object to be used to add to metadata. + * @param context The job context. + * @param jobInfo The OutputJobInfo. 
+ * @param partLocnRoot The table-equivalent location root of the partition + * (temporary dir if dynamic partition, table dir if static) + * @param dynPartPath The path of dynamic partition which is created + * @param partKVs The keyvalue pairs that form the partition + * @param outputSchema The output schema for the partition + * @param params The parameters to store inside the partition + * @param table The Table metadata object under which this Partition will reside + * @param fs FileSystem object to operate on the underlying filesystem + * @param grpName Group name that owns the table dir + * @param perms FsPermission that's the default permission of the table dir. + * @return Constructed Partition metadata object + * @throws java.io.IOException + */ + + private Partition constructPartition( + JobContext context, OutputJobInfo jobInfo, + String partLocnRoot, String dynPartPath, Map partKVs, + HCatSchema outputSchema, Map params, + Table table, FileSystem fs, + String grpName, FsPermission perms) throws IOException { + + Partition partition = new Partition(); + partition.setDbName(table.getDbName()); + partition.setTableName(table.getTableName()); + partition.setSd(new StorageDescriptor(table.getTTable().getSd())); + + List fields = new ArrayList(); + for (HCatFieldSchema fieldSchema : outputSchema.getFields()) { + fields.add(HCatSchemaUtils.getFieldSchema(fieldSchema)); + } + + partition.getSd().setCols(fields); + + partition.setValues(FileOutputFormatContainer.getPartitionValueList(table, partKVs)); + + partition.setParameters(params); + + // Sets permissions and group name on partition dirs and files. 
+ + Path partPath; + if (customDynamicLocationUsed) { + partPath = new Path(dynPartPath); + } else if (!dynamicPartitioningUsed + && Boolean.parseBoolean((String)table.getProperty("EXTERNAL")) + && jobInfo.getLocation() != null && jobInfo.getLocation().length() > 0) { + String jobLocation = jobInfo.getLocation(); //need to change this for magic + partPath = new Path(jobLocation); + } else { + partPath = new Path(partLocnRoot); + int i = 0; + for (FieldSchema partKey : table.getPartitionKeys()) { + if (i++ != 0) { + fs.mkdirs(partPath); // Attempt to make the path in case it does not exist before we check + applyGroupAndPerms(fs, partPath, perms, grpName, false); + } + partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs); + } + } + + // Apply the group and permissions to the leaf partition and files. + // Need not bother in case of HDFS as permission is taken care of by setting UMask + fs.mkdirs(partPath); // Attempt to make the path in case it does not exist before we check + if (!ShimLoader.getHadoopShims().getHCatShim().isFileInHDFS(fs, partPath)) { + applyGroupAndPerms(fs, partPath, perms, grpName, true); + } + + // Set the location in the StorageDescriptor + if (dynamicPartitioningUsed) { + String dynamicPartitionDestination = getFinalDynamicPartitionDestination(table, partKVs, jobInfo); + if (harProcessor.isEnabled()) { + harProcessor.exec(context, partition, partPath); + partition.getSd().setLocation( + harProcessor.getProcessedLocation(new Path(dynamicPartitionDestination))); + } else { + partition.getSd().setLocation(dynamicPartitionDestination); + } + } else { + partition.getSd().setLocation(partPath.toString()); + } + return partition; + } + + private void applyGroupAndPerms(FileSystem fs, Path dir, FsPermission permission, + String group, boolean recursive) + throws IOException { + if(LOG.isDebugEnabled()) { + LOG.debug("applyGroupAndPerms : " + dir + + " perms: " + permission + + " group: " + group + " recursive: " + 
recursive); + } + fs.setPermission(dir, permission); + if (recursive) { + for (FileStatus fileStatus : fs.listStatus(dir)) { + if (fileStatus.isDir()) { + applyGroupAndPerms(fs, fileStatus.getPath(), permission, group, true); + } else { + fs.setPermission(fileStatus.getPath(), permission); + } + } + } + } + + private String getFinalDynamicPartitionDestination(Table table, Map partKVs, + OutputJobInfo jobInfo) { + Path partPath = new Path(table.getTTable().getSd().getLocation()); + if (!customDynamicLocationUsed) { + // file:///tmp/hcat_junit_warehouse/employee/_DYN0.7770480401313761/emp_country=IN/emp_state=KA -> + // file:///tmp/hcat_junit_warehouse/employee/emp_country=IN/emp_state=KA + for (FieldSchema partKey : table.getPartitionKeys()) { + partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs); + } + + return partPath.toString(); + } else { + // if custom root specified, update the parent path + if (jobInfo.getCustomDynamicRoot() != null + && jobInfo.getCustomDynamicRoot().length() > 0) { + partPath = new Path(partPath, jobInfo.getCustomDynamicRoot()); + } + return new Path(partPath, HCatFileUtil.resolveCustomPath(jobInfo, partKVs, false)).toString(); + } + } + + private Map getStorerParameterMap(StorerInfo storer) { + Map params = new HashMap(); + + //Copy table level hcat.* keys to the partition + for (Map.Entry entry : storer.getProperties().entrySet()) { + if (!entry.getKey().toString().equals(StatsSetupConst.COLUMN_STATS_ACCURATE)) { + params.put(entry.getKey().toString(), entry.getValue().toString()); + } + } + return params; + } + + private Path constructPartialPartPath(Path partialPath, String partKey, Map partKVs) { + + StringBuilder sb = new StringBuilder(FileUtils.escapePathName(partKey)); + sb.append("="); + sb.append(FileUtils.escapePathName(partKVs.get(partKey))); + return new Path(partialPath, sb.toString()); + } + + /** + * Update table schema, adding new columns as added for the partition. 
+ * @param client the client + * @param table the table + * @param partitionSchema the schema of the partition + * @throws java.io.IOException Signals that an I/O exception has occurred. + * @throws org.apache.hadoop.hive.metastore.api.InvalidOperationException the invalid operation exception + * @throws org.apache.hadoop.hive.metastore.api.MetaException the meta exception + * @throws org.apache.thrift.TException the t exception + */ + private void updateTableSchema(IMetaStoreClient client, Table table, + HCatSchema partitionSchema) throws IOException, InvalidOperationException, MetaException, TException { + + + List newColumns = HCatUtil.validatePartitionSchema(table, partitionSchema); + + if (newColumns.size() != 0) { + List tableColumns = new ArrayList(table.getTTable().getSd().getCols()); + tableColumns.addAll(newColumns); + + //Update table schema to add the newly added columns + table.getTTable().getSd().setCols(tableColumns); + client.alter_table(table.getDbName(), table.getTableName(), table.getTTable()); + } + } + + /** + * Find the final name of a given output file, given the output directory + * and the work directory. If immutable, attempt to create file of name + * _aN till we find an item that does not exist. 
+ * @param file the file to move + * @param src the source directory + * @param dest the target directory + * @return the final path for the specific output file + * @throws java.io.IOException + */ + private Path getFinalPath(FileSystem fs, Path file, Path src, + Path dest, final boolean immutable) throws IOException { + URI taskOutputUri = file.toUri(); + URI relativePath = src.toUri().relativize(taskOutputUri); + if (taskOutputUri == relativePath) { + throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Can not get the relative path: base = " + + src + " child = " + file); + } + if (relativePath.getPath().length() > 0) { + + Path itemDest = new Path(dest, relativePath.getPath()); + if (!immutable){ + String name = relativePath.getPath(); + String filetype; + int index = name.lastIndexOf('.'); + if (index >= 0) { + filetype = name.substring(index); + name = name.substring(0, index); + } else { + filetype = ""; + } + + // Attempt to find maxAppendAttempts possible alternatives to a filename by + // appending _a_N and seeing if that destination also clashes. If we're + // still clashing after that, give up. + int counter = 1; + for (; fs.exists(itemDest) && counter < maxAppendAttempts; counter++) { + itemDest = new Path(dest, name + (APPEND_SUFFIX + counter) + filetype); + } + + if (counter == maxAppendAttempts){ + throw new HCatException(ErrorType.ERROR_MOVE_FAILED, + "Could not find a unique destination path for move: file = " + + file + " , src = " + src + ", dest = " + dest); + } else if (counter > APPEND_COUNTER_WARN_THRESHOLD) { + LOG.warn("Append job used filename clash counter [" + counter + +"] which is greater than warning limit [" + APPEND_COUNTER_WARN_THRESHOLD + +"]. Please compact this table so that performance is not impacted." 
+ + " Please see HIVE-9381 for details."); + } + + } + + if (LOG.isDebugEnabled()){ + LOG.debug("FinalPath(file:"+file+":"+src+"->"+dest+"="+itemDest); + } + + return itemDest; + } else { + + return dest; + } + } + + /** + * Run to discover dynamic partitions available + */ + private void discoverPartitions(JobContext context) throws IOException { + if (!partitionsDiscovered) { + // LOG.info("discover ptns called"); + OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration()); + + harProcessor.setEnabled(jobInfo.getHarRequested()); + + List dynamicPartCols = jobInfo.getPosOfDynPartCols(); + int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions(); + + Path loadPath = new Path(jobInfo.getLocation()); + FileSystem fs = loadPath.getFileSystem(context.getConfiguration()); + + // construct a path pattern (e.g., /*/*) to find all dynamically generated paths + String dynPathSpec = loadPath.toUri().getPath(); + dynPathSpec = dynPathSpec.replaceAll("__HIVE_DEFAULT_PARTITION__", "*"); + + // LOG.info("Searching for "+dynPathSpec); + Path pathPattern = new Path(dynPathSpec); + FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER); + + partitionsDiscoveredByPath = new LinkedHashMap>(); + contextDiscoveredByPath = new LinkedHashMap(); + + + if (status.length == 0) { + // LOG.warn("No partition found genereated by dynamic partitioning in [" + // +loadPath+"] with depth["+jobInfo.getTable().getPartitionKeysSize() + // +"], dynSpec["+dynPathSpec+"]"); + } else { + if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) { + this.partitionsDiscovered = true; + throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, + "Number of dynamic partitions being created " + + "exceeds configured max allowable partitions[" + + maxDynamicPartitions + + "], increase parameter [" + + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + + "] if needed."); + } + + for (FileStatus st : status) { + LinkedHashMap 
fullPartSpec = new LinkedHashMap(); + if (!customDynamicLocationUsed) { + Warehouse.makeSpecFromName(fullPartSpec, st.getPath(), null); + } else { + HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo, + st.getPath().toString()); + } + partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec); + JobConf jobConf = (JobConf)context.getConfiguration(); + JobContext currContext = HCatMapRedUtil.createJobContext( + jobConf, + context.getJobID(), + InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf, + ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID()))); + HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec); + contextDiscoveredByPath.put(st.getPath().toString(), currContext); + } + } + + // for (Entry> spec : partitionsDiscoveredByPath.entrySet()){ + // LOG.info("Partition "+ spec.getKey()); + // for (Entry e : spec.getValue().entrySet()){ + // LOG.info(e.getKey() + "=>" +e.getValue()); + // } + // } + + this.partitionsDiscovered = true; + } + } + + private void registerPartitions(JobContext context) throws IOException{ + if (dynamicPartitioningUsed){ + discoverPartitions(context); + } + OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration()); + Configuration conf = context.getConfiguration(); + Table table = new Table(jobInfo.getTableInfo().getTable()); + Path tblPath = new Path(table.getTTable().getSd().getLocation()); + FileSystem fs = tblPath.getFileSystem(conf); + IMetaStoreClient client = null; + HCatTableInfo tableInfo = jobInfo.getTableInfo(); + List partitionsAdded = new ArrayList(); + try { + HiveConf hiveConf = HCatUtil.getHiveConf(conf); + client = HCatUtil.getHiveMetastoreClient(hiveConf); + if (table.getPartitionKeys().size() == 0) { + // Move data from temp directory the actual table directory + // No metastore operation required. 
+ Path src = new Path(jobInfo.getLocation()); + if (!src.equals(tblPath)) { + fs.delete(src, true); + } + if (table.getParameters() != null + && table.getParameters().containsKey(StatsSetupConst.COLUMN_STATS_ACCURATE)) { + table.getParameters().remove(StatsSetupConst.COLUMN_STATS_ACCURATE); + client.alter_table(table.getDbName(), table.getTableName(), table.getTTable()); + } + return; + } + + StorerInfo storer = InternalUtil.extractStorerInfo(table.getTTable().getSd(), + table.getParameters()); + + FileStatus tblStat = fs.getFileStatus(tblPath); + String grpName = tblStat.getGroup(); + FsPermission perms = tblStat.getPermission(); + + List partitionsToAdd = new ArrayList(); + if (!dynamicPartitioningUsed) { + partitionsToAdd.add(constructPartition(context, jobInfo, tblPath.toString(), null, + jobInfo.getPartitionValues(), jobInfo.getOutputSchema(), getStorerParameterMap(storer), + table, fs, grpName, perms)); + } else { + for (Map.Entry> entry : partitionsDiscoveredByPath.entrySet()) { + partitionsToAdd.add(constructPartition(context, jobInfo, + getPartitionRootLocation(entry.getKey(), entry.getValue().size()), entry.getKey(), + entry.getValue(), jobInfo.getOutputSchema(), getStorerParameterMap(storer), table, + fs, grpName, perms)); + } + } + + ArrayList> ptnInfos = new ArrayList>(); + for(Partition ptn : partitionsToAdd){ + ptnInfos.add(InternalUtil.createPtnKeyValueMap(new Table(tableInfo.getTable()), ptn)); + } + + /** + * Dynamic partitioning & Append incompatibility note: + * + * Currently, we do not support mixing dynamic partitioning and append in the + * same job. One reason is that we need exhaustive testing of corner cases + * for that, and a second reason is the behaviour of add_partitions. To support + * dynamic partitioning with append, we'd have to have a add_partitions_if_not_exist + * call, rather than an add_partitions call. 
Thus far, we've tried to keep the + * implementation of append jobtype-agnostic, but here, in code, we assume that + * a table is considered immutable if dynamic partitioning is enabled on the job. + * + * This does not mean that we can check before the job begins that this is going + * to be a dynamic partition job on an immutable table and thus fail the job, since + * it is quite possible to have a dynamic partitioning job run on an unpopulated + * immutable table. It simply means that at the end of the job, as far as copying + * in data is concerned, we will pretend that the table is immutable irrespective + * of what table.isImmutable() tells us. + */ + + //Publish the new partition(s) + if (dynamicPartitioningUsed && harProcessor.isEnabled() && (!partitionsToAdd.isEmpty())){ + + if (!customDynamicLocationUsed) { + Path src = new Path(ptnRootLocation); + // check here for each dir we're copying out, to see if it + // already exists, error out if so. + // Also, treat dyn-writes as writes to immutable tables. + //moveTaskOutputs(fs, src, src, tblPath, true, true); // dryRun = true, immutable = true + //moveTaskOutputs(fs, src, src, tblPath, false, true); + if (!src.equals(tblPath)){ + fs.delete(src, true); + } + } else { + } + try { + updateTableSchema(client, table, jobInfo.getOutputSchema()); + LOG.info("HAR is being used. The table {} has new partitions {}.", table.getTableName(), ptnInfos); + client.add_partitions(partitionsToAdd); + partitionsAdded = partitionsToAdd; + } catch (Exception e){ + // There was an error adding partitions : rollback fs copy and rethrow + for (Partition p : partitionsToAdd){ + Path ptnPath = new Path(harProcessor.getParentFSPath(new Path(p.getSd().getLocation()))); + if (fs.exists(ptnPath)){ + fs.delete(ptnPath,true); + } + } + throw e; + } + + }else{ + + // no harProcessor, regular operation + updateTableSchema(client, table, jobInfo.getOutputSchema()); + LOG.info("HAR is not being used. 
The table {} has new partitions {}.", table.getTableName(), ptnInfos); + if (partitionsToAdd.size() > 0){ + if (!dynamicPartitioningUsed ) { + + // regular single-partition write into a partitioned table. + //Move data from temp directory the actual table directory + if (partitionsToAdd.size() > 1){ + throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, + "More than one partition to publish in non-dynamic partitioning job"); + } + Partition p = partitionsToAdd.get(0); + + // Now, we check if the partition already exists. If not, we go ahead. + // If so, we error out if immutable, and if mutable, check that the partition's IF + // matches our current job's IF (table's IF) to check for compatibility. If compatible, we + // ignore and do not add. If incompatible, we error out again. + + boolean publishRequired = false; + try { + Partition existingP = client.getPartition(p.getDbName(),p.getTableName(),p.getValues()); + if (existingP != null){ + if (table.isImmutable()){ + throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION, + "Attempted duplicate partition publish on to immutable table"); + } else { + if (! existingP.getSd().getInputFormat().equals(table.getInputFormatClass().getName())){ + throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, + "Attempted partition append, where old partition format was " + + existingP.getSd().getInputFormat() + + " and table format was " + + table.getInputFormatClass().getName()); + } + } + } else { + publishRequired = true; + } + } catch (NoSuchObjectException e){ + // All good, no such partition exists, move on. 
+ publishRequired = true; + } + if (publishRequired){ + client.add_partitions(partitionsToAdd); + partitionsAdded = partitionsToAdd; + } + + } else { + client.add_partitions(partitionsToAdd); + partitionsAdded = partitionsToAdd; + } + } + + // Set permissions appropriately for each of the partitions we just created + // so as to have their permissions mimic the table permissions + for (Partition p : partitionsAdded){ + applyGroupAndPerms(fs,new Path(p.getSd().getLocation()),tblStat.getPermission(),tblStat.getGroup(),true); + } + + } + } catch (Exception e) { + if (partitionsAdded.size() > 0) { + try { + // baseCommitter.cleanupJob failed, try to clean up the + // metastore + for (Partition p : partitionsAdded) { + client.dropPartition(tableInfo.getDatabaseName(), + tableInfo.getTableName(), p.getValues(), true); + } + } catch (Exception te) { + // Keep cause as the original exception + throw new HCatException( + ErrorType.ERROR_PUBLISHING_PARTITION, e); + } + } + if (e instanceof HCatException) { + throw (HCatException) e; + } else { + throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e); + } + } finally { + HCatUtil.closeHiveClientQuietly(client); + } + } + + private void cancelDelegationTokens(JobContext context) throws IOException{ + LOG.info("Cancelling delegation token for the job."); + IMetaStoreClient client = null; + try { + HiveConf hiveConf = HCatUtil + .getHiveConf(context.getConfiguration()); + client = HCatUtil.getHiveMetastoreClient(hiveConf); + // cancel the deleg. tokens that were acquired for this job now that + // we are done - we should cancel if the tokens were acquired by + // HCatOutputFormat and not if they were supplied by Oozie. 
+ // In the latter case the HCAT_KEY_TOKEN_SIGNATURE property in + // the conf will not be set + String tokenStrForm = client.getTokenStrForm(); + String hCatKeyTokenSignature = context.getConfiguration().get( + HCatConstants.HCAT_KEY_TOKEN_SIGNATURE); + if (tokenStrForm != null + && hCatKeyTokenSignature != null) { + LOG.info("S3OutputCommitterContainer::cancelDelegationTokens(): " + + "Cancelling token fetched for HCAT_KEY_TOKEN_SIGNATURE == (" + hCatKeyTokenSignature + ")."); + client.cancelDelegationToken(tokenStrForm); + } + else { + LOG.info("S3OutputCommitterContainer::cancelDelegationTokens(): " + + "Could not find tokenStrForm, or HCAT_KEY_TOKEN_SIGNATURE. Skipping token cancellation."); + } + } catch (MetaException e) { + LOG.warn("MetaException while cancelling delegation token.", e); + } catch (TException e) { + LOG.warn("TException while cancelling delegation token.", e); + } finally { + HCatUtil.closeHiveClientQuietly(client); + } + } + + +} diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/s3/commit/magic/MagicS3GuardCommitter.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/s3/commit/magic/MagicS3GuardCommitter.java new file mode 100644 index 000000000000..5c5cfb3db6a8 --- /dev/null +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/s3/commit/magic/MagicS3GuardCommitter.java @@ -0,0 +1,86 @@ +package org.apache.hive.hcatalog.mapreduce.s3.commit.magic; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitterFactory; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.fs.s3a.commit.CommitConstants; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +/** + * This is a dedicated committer which requires the "magic" 
directory feature + * of the S3A Filesystem to be enabled; it then uses paths for task and job + * attempts in magic paths, so as to ensure that the final output goes direct + * to the destination directory. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class MagicS3GuardCommitter extends OutputCommitter { + private static final Logger LOG = + LoggerFactory.getLogger(MagicS3GuardCommitter.class); + + /** Name: {@value}. */ + public static final String NAME = CommitConstants.COMMITTER_NAME_MAGIC; + + org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitter committer = null; + public MagicS3GuardCommitter() { + } //necessary only for mapred API reasons, should never be actually used + + public MagicS3GuardCommitter(Path outputPath, TaskAttemptContext context) throws IOException { + committer = (org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitter) MagicS3GuardCommitterFactory.createCommitter(outputPath, context); + } + + private org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitter getWrapped(JobContext context) throws IOException { + //Hadoop's committer only supports being created with a TaskAttemptContext, so we create a dummy instance for it since we only care about the job in this case + TaskAttemptContext tac = new org.apache.hadoop.mapred.TaskAttemptContextImpl(context.getJobConf(), new TaskAttemptID(context.getJobID().getJtIdentifier(), context.getJobID().getId(), false,0,0)); + if (committer == null) { + committer = (org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitter) MagicS3GuardCommitterFactory.createCommitter(new Path(context.getConfiguration().get("mapred.output.dir")), tac); + } + return committer; + } + + private org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitter getWrapped(TaskAttemptContext context) throws IOException { + if (committer == null) { + committer = (org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitter) MagicS3GuardCommitterFactory.createCommitter(new 
Path(context.getConfiguration().get("mapred.output.dir")), context); + } + return committer; + } + + @Override + public void setupJob(JobContext context) throws IOException { + getWrapped(context).setupJob(context); + } + + @Override + public void setupTask(org.apache.hadoop.mapred.TaskAttemptContext context) throws IOException { + getWrapped(context).setupTask(context); + } + + @Override + public boolean needsTaskCommit(org.apache.hadoop.mapred.TaskAttemptContext context) throws IOException { + return getWrapped(context).needsTaskCommit(context); + } + + @Override + public void commitTask(org.apache.hadoop.mapred.TaskAttemptContext context) throws IOException { + getWrapped(context).commitTask(context); + } + + @Override + public void abortTask(org.apache.hadoop.mapred.TaskAttemptContext context) throws IOException { + getWrapped(context).abortTask(context); + } + + public void commitJob(org.apache.hadoop.mapred.JobContext context) throws IOException { + getWrapped(context).commitJob(context); + } + + public final Path getWorkPath() { + return committer.getWorkPath(); + } +} + diff --git a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/common/TestHiveClientCache.java b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/common/TestHiveClientCache.java index fe1d8afdc8bd..a0b276650382 100644 --- a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/common/TestHiveClientCache.java +++ b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/common/TestHiveClientCache.java @@ -167,7 +167,7 @@ public void testCloseAllClients() throws IOException, MetaException, LoginExcept /** * Test that a long table name actually breaks the HMSC. 
Subsequently check that isOpen() reflects * and tells if the client is broken - */ + *//* @Ignore("hangs indefinitely") @Test public void testHMSCBreakability() throws IOException, MetaException, LoginException, TException, AlreadyExistsException, @@ -215,7 +215,7 @@ public void testHMSCBreakability() throws IOException, MetaException, LoginExcep assertFalse(client.isOpen()); metaServer.shutDown(); - } + }*/ private static class LocalMetaServer implements Runnable { public final int MS_PORT = 20101; diff --git a/hcatalog/pom.xml b/hcatalog/pom.xml index f6f302c709f9..fabf978cff5c 100644 --- a/hcatalog/pom.xml +++ b/hcatalog/pom.xml @@ -65,6 +65,11 @@ ${hadoop.version} test + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + org.apache.pig pig diff --git a/hcatalog/webhcat/svr/src/test/java/org/apache/hive/hcatalog/templeton/mock/MockUriInfo.java b/hcatalog/webhcat/svr/src/test/java/org/apache/hive/hcatalog/templeton/mock/MockUriInfo.java index 1cc0d8775567..415e5763c33a 100644 --- a/hcatalog/webhcat/svr/src/test/java/org/apache/hive/hcatalog/templeton/mock/MockUriInfo.java +++ b/hcatalog/webhcat/svr/src/test/java/org/apache/hive/hcatalog/templeton/mock/MockUriInfo.java @@ -136,13 +136,11 @@ public UriBuilder getRequestUriBuilder() { return null; } - @Override public URI relativize(URI uri) { // TODO Auto-generated method stub return null; } - @Override public URI resolve(URI uri) { // TODO Auto-generated method stub return null; diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/history/TestHiveHistory.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/history/TestHiveHistory.java index 9b50fd4f3061..a8f5d0162c20 100644 --- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/history/TestHiveHistory.java +++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/history/TestHiveHistory.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.ql.DriverFactory; import 
org.apache.hadoop.hive.ql.IDriver; +import org.apache.hadoop.hive.ql.exec.HiveDataCommitter; import org.apache.hadoop.hive.ql.history.HiveHistory.Keys; import org.apache.hadoop.hive.ql.history.HiveHistory.QueryInfo; import org.apache.hadoop.hive.ql.history.HiveHistory.TaskInfo; @@ -107,7 +108,7 @@ protected void setUp() { db.createTable(src, cols, null, TextInputFormat.class, IgnoreKeyTextOutputFormat.class); db.loadTable(hadoopDataFile[i], src, - LoadFileType.KEEP_EXISTING, false, false, false, false, null, 0, false); + LoadFileType.KEEP_EXISTING, false, false, false, false, null, 0, false, new HiveDataCommitter()); i++; } diff --git a/ql/pom.xml b/ql/pom.xml index db53950dc53e..e56d4ae7c2a9 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -206,6 +206,11 @@ ${hadoop.version} true + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + org.apache.hadoop hadoop-yarn-registry diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataCommitter.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataCommitter.java new file mode 100644 index 000000000000..d51ec8d6d9bd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataCommitter.java @@ -0,0 +1,34 @@ +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.session.SessionState; + +import java.util.List; + + +/** + * Defines how Hive will commit data to its final directory. 
+ */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public interface DataCommitter { + + void moveFile(Path sourcePath, Path targetPath, boolean isDfsDir, HiveConf conf, + SessionState.LogHelper console) throws HiveException; + + void copyFiles(HiveConf conf, Path srcf, Path destf, FileSystem fs, boolean isSrcLocal, + boolean isAcidIUD, boolean isOverwrite, List newFiles, boolean isBucketed, + boolean isFullAcidTable, boolean isManaged) throws HiveException; + + void replaceFiles(Path tablePath, Path srcf, Path destf, Path oldPath, HiveConf conf, + boolean isSrcLocal, boolean purge, List newFiles, + PathFilter deletePathFilter, boolean isNeedRecycle, boolean isManaged, + Hive hive) throws HiveException; +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java index 2a74f86bb523..8dd636bca079 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java @@ -85,6 +85,14 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.hive.ql.exec.tez.TezContext; +import org.apache.tez.runtime.api.ProcessorContext; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hive.common.util.HiveStringUtils; import org.apache.hive.common.util.Murmur3; @@ -148,6 +156,11 @@ public class FileSinkOperator extends TerminalOperator implements private transient boolean isInsertOverwrite; private transient String counterGroup; private transient 
BiFunction hashFunc; + private transient PathOutputCommitter pathOutputCommitter; + private transient TaskAttemptContext taskAttemptContext; + + public static final String TOTAL_TABLE_ROWS_WRITTEN = "TOTAL_TABLE_ROWS_WRITTEN"; + /** * Counters. */ @@ -249,7 +262,7 @@ private void commitOneOutPath(int idx, FileSystem fs, List commitPaths) } FileUtils.mkdir(fs, finalPaths[idx].getParent(), hconf); } - if(outPaths[idx] != null && fs.exists(outPaths[idx])) { + if(pathOutputCommitter == null && outPaths[idx] != null && fs.exists(outPaths[idx])) { if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) { Utilities.FILE_OP_LOGGER.trace("committing " + outPaths[idx] + " to " + finalPaths[idx] + " (" + isMmTable + ")"); @@ -273,6 +286,13 @@ private void commitOneOutPath(int idx, FileSystem fs, List commitPaths) } } + if (pathOutputCommitter != null && outPaths[idx] != null && + outPaths[idx].getFileSystem(hconf).exists(outPaths[idx])) { + if (pathOutputCommitter.needsTaskCommit(taskAttemptContext)) { + pathOutputCommitter.commitTask(taskAttemptContext); + } + } + updateProgress(); } @@ -294,7 +314,7 @@ public void abortWriters(FileSystem fs, boolean abort, boolean delete) throws Hi } public void initializeBucketPaths(int filesIdx, String taskId, boolean isNativeTable, - boolean isSkewedStoredAsSubDirectories) { + boolean isSkewedStoredAsSubDirectories) throws IOException { if (isNativeTable) { String extension = Utilities.getFileExtension(jc, isCompressed, hiveOutputFormat); String taskWithExt = extension == null ? 
taskId : taskId + extension; @@ -304,7 +324,11 @@ public void initializeBucketPaths(int filesIdx, String taskId, boolean isNativeT } else { finalPaths[filesIdx] = new Path(buildTmpPath(), taskWithExt); } - outPaths[filesIdx] = new Path(buildTaskOutputTempPath(), Utilities.toTempPath(taskId)); + if (pathOutputCommitter != null) { + outPaths[filesIdx] = getPathOutputCommitterFile(taskId); + } else { + outPaths[filesIdx] = new Path(buildTaskOutputTempPath(), Utilities.toTempPath(taskId)); + } } else { String taskIdPath = taskId; if (conf.isMerge()) { @@ -534,6 +558,13 @@ protected void initializeOp(Configuration hconf) throws HiveException { destTablePath = conf.getDestPath(); isInsertOverwrite = conf.getInsertOverwrite(); counterGroup = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVECOUNTERGROUP); + + if (conf.getHasOutputCommitter()) { + taskAttemptContext = createTaskAttemptContext(); + pathOutputCommitter = createPathOutputCommitter(); + pathOutputCommitter.setupTask(taskAttemptContext); + } + if (LOG.isInfoEnabled()) { LOG.info("Using serializer : " + serializer + " and formatter : " + hiveOutputFormat + (isCompressed ? " with compression" : "")); @@ -733,9 +764,9 @@ protected void createBucketFiles(FSPaths fsp) throws HiveException { } assert filesIdx == numFiles; - // in recent hadoop versions, use deleteOnExit to clean tmp files. + // in recent hadoop versions, use deleteOnExit to clean tmp files. if (isNativeTable() && fs != null && fsp != null && !conf.isMmTable()) { - autoDelete = fs.deleteOnExit(fsp.outPaths[0]); + autoDelete = fsp.outPaths[0].getFileSystem(hconf).deleteOnExit(fsp.outPaths[0]); } } catch (Exception e) { e.printStackTrace(); @@ -760,7 +791,7 @@ protected void createBucketForFileIdx(FSPaths fsp, int filesIdx) if (isNativeTable() && !conf.isMmTable()) { // in recent hadoop versions, use deleteOnExit to clean tmp files. 
- autoDelete = fs.deleteOnExit(fsp.outPaths[filesIdx]); + autoDelete = fsp.outPaths[filesIdx].getFileSystem(hconf).deleteOnExit(fsp.outPaths[filesIdx]); } updateDPCounters(fsp, filesIdx); @@ -1593,4 +1624,64 @@ private boolean isNativeTable() { return !conf.getTableInfo().isNonNative(); } + private PathOutputCommitter createPathOutputCommitter() throws IOException { + return PathOutputCommitterFactory.createCommitter(new Path(conf.getTargetDirName()), + taskAttemptContext); + } + + private TaskAttemptContextImpl createTaskAttemptContext() { + // Get task and attempt info from context if available + int taskId = 0; + int attemptId = 0; + TaskType taskType = TaskType.MAP; + + MapredContext ctx = MapredContext.get(); + TaskAttemptID origId = ctx != null ? ctx.getTaskAttemptID() : null; + if (origId != null) { + taskId = origId.getTaskID().getId(); + attemptId = origId.getId(); + taskType = origId.getTaskType(); + } else if (ctx instanceof TezContext) { + // In Tez, MapredContext.getTaskAttemptID() returns null, but we can get + // the unique task index from TezContext's ProcessorContext + TezContext tezCtx = (TezContext) ctx; + ProcessorContext procCtx = tezCtx.getTezProcessorContext(); + if (procCtx != null) { + taskId = procCtx.getTaskIndex(); + attemptId = procCtx.getTaskAttemptNumber(); + // Tez reducers should use REDUCE type for proper path separation + taskType = tezCtx.isMap() ? TaskType.MAP : TaskType.REDUCE; + LOG.info("Using Tez ProcessorContext for TaskAttemptID: taskIndex={}, attemptNumber={}, isMap={}", + taskId, attemptId, tezCtx.isMap()); + } + } + + // IMPORTANT: Use empty jtIdentifier ("") and job id 0 to match what PathOutputCommitterResolver + // uses during planning. This ensures the magic committer's __magic_job-{id} paths are consistent + // between task writes and job commit. See HADOOP-19091. 
+ // PathOutputCommitterResolver sets hive.magic.committer.job.id to "job__0000" (JobID("", 0)) + TaskAttemptID taskAttemptID = new TaskAttemptID(org.apache.commons.lang.StringUtils.EMPTY, 0, + taskType, taskId, attemptId); + + // We want the committer to ignore the application attempt id because there is no way to know + // the correct value during job commit, so we force the committer to always use the default + // This is safe because Hive tasks are deterministic and different application attempts will + // always write to the same file + hconf.unset("mapreduce.job.application.attempt.id"); + + return new TaskAttemptContextImpl(hconf, taskAttemptID); + } + + /** + * Get the {@link Path} to the file that Hive will write to. This file will be under the + * working path of the {@link PathOutputCommitter}. The name of the file is a combination of + * the current task ID and the Hive query ID. The reason the query ID is appended to the + * filename is to ensure uniqueness of the files. Since the committer is writing directly to + * the target table, it's possible data for the table already exists. By appending the query ID, + * we ensure that the existing files never get overwritten. 
+ */ + private Path getPathOutputCommitterFile(String taskId) throws IOException { + return new Path(pathOutputCommitter.getWorkPath(), + taskId + "-" + hconf.get(ConfVars.HIVEQUERYID.varname)); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/HiveDataCommitter.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/HiveDataCommitter.java new file mode 100644 index 000000000000..c53f36fd8a01 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/HiveDataCommitter.java @@ -0,0 +1,836 @@ +package org.apache.hadoop.hive.ql.exec; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +import org.apache.commons.io.FilenameUtils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Options; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.common.ObjectPair; +import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.io.HdfsUtils; +import org.apache.hadoop.hive.metastore.api.CmRecycleRequest; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.log.PerfLogger; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.MoveWork; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.util.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +/** + * A {@link DataCommitter} that commits Hive data using a {@link FileSystem}. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class HiveDataCommitter implements DataCommitter { + + private static final Logger LOG = LoggerFactory.getLogger(DataCommitter.class.getName()); + + private MoveWork work; + + HiveDataCommitter(MoveWork moveWork) { + this.work = moveWork; + } + + @VisibleForTesting + public HiveDataCommitter() { + // Do nothing + } + + @Override + public void moveFile(Path sourcePath, Path targetPath, boolean isDfsDir, HiveConf conf, + SessionState.LogHelper console) throws HiveException { + try { + PerfLogger perfLogger = SessionState.getPerfLogger(); + perfLogger.PerfLogBegin("MoveTask", PerfLogger.FILE_MOVES); + + String mesg = "Moving data to " + (isDfsDir ? 
"" : "local ") + "directory " + + targetPath.toString(); + String mesg_detail = " from " + sourcePath.toString(); + console.printInfo(mesg, mesg_detail); + + FileSystem fs = sourcePath.getFileSystem(conf); + if (isDfsDir) { + moveFileInDfs (sourcePath, targetPath, conf); + } else { + // This is a local file + FileSystem dstFs = FileSystem.getLocal(conf); + moveFileFromDfsToLocal(sourcePath, targetPath, fs, dstFs, conf); + } + + perfLogger.PerfLogEnd("MoveTask", PerfLogger.FILE_MOVES); + } catch (Exception e) { + throw new HiveException("Unable to move source " + sourcePath + " to destination " + + targetPath, e); + } + } + + private void moveFileInDfs (Path sourcePath, Path targetPath, HiveConf conf) + throws HiveException, IOException { + + final FileSystem srcFs, tgtFs; + try { + tgtFs = targetPath.getFileSystem(conf); + } catch (IOException e) { + LOG.error("Failed to get dest fs", e); + throw new HiveException(e.getMessage(), e); + } + try { + srcFs = sourcePath.getFileSystem(conf); + } catch (IOException e) { + LOG.error("Failed to get src fs", e); + throw new HiveException(e.getMessage(), e); + } + + // if source exists, rename. Otherwise, create a empty directory + if (srcFs.exists(sourcePath)) { + Path deletePath = null; + // If it multiple level of folder are there fs.rename is failing so first + // create the targetpath.getParent() if it not exist + if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_INSERT_INTO_MULTILEVEL_DIRS)) { + deletePath = createTargetPath(targetPath, tgtFs); + } + //For acid table incremental replication, just copy the content of staging directory to destination. + //No need to clean it. + if (work.isNeedCleanTarget()) { + Hive.clearDestForSubDirSrc(conf, targetPath, sourcePath, false); + } + // Set isManaged to false as this is not load data operation for which it is needed. 
+ if (!moveFile(conf, sourcePath, targetPath, true, false, false)) { + try { + if (deletePath != null) { + tgtFs.delete(deletePath, true); + } + } catch (IOException e) { + LOG.info("Unable to delete the path created for facilitating rename: {}", + deletePath); + } + throw new HiveException("Unable to rename: " + sourcePath + + " to: " + targetPath); + } + } else if (!tgtFs.mkdirs(targetPath)) { + throw new HiveException("Unable to make directory: " + targetPath); + } + } + + private void moveFileFromDfsToLocal(Path sourcePath, Path targetPath, FileSystem fs, + FileSystem dstFs, + Configuration conf) throws HiveException, IOException { + // RawLocalFileSystem seems not able to get the right permissions for a local file, it + // always returns hdfs default permission (00666). So we can not overwrite a directory + // by deleting and recreating the directory and restoring its permissions. We should + // delete all its files and subdirectories instead. + if (dstFs.exists(targetPath)) { + if (dstFs.isDirectory(targetPath)) { + FileStatus[] destFiles = dstFs.listStatus(targetPath); + for (FileStatus destFile : destFiles) { + if (!dstFs.delete(destFile.getPath(), true)) { + throw new IOException("Unable to clean the destination directory: " + targetPath); + } + } + } else { + throw new HiveException("Target " + targetPath + " is not a local directory."); + } + } else { + if (!FileUtils.mkdir(dstFs, targetPath, conf)) { + throw new HiveException("Failed to create local target directory " + targetPath); + } + } + + if (fs.exists(sourcePath)) { + FileStatus[] srcs = fs.listStatus(sourcePath, FileUtils.HIDDEN_FILES_PATH_FILTER); + for (FileStatus status : srcs) { + fs.copyToLocalFile(status.getPath(), targetPath); + } + } + } + + private Path createTargetPath(Path targetPath, FileSystem fs) throws IOException { + Path deletePath = null; + Path mkDirPath = targetPath.getParent(); + if (mkDirPath != null && !fs.exists(mkDirPath)) { + Path actualPath = mkDirPath; + // targetPath 
path is /x/y/z/1/2/3 here /x/y/z is present in the file system + // create the structure till /x/y/z/1/2 to work rename for multilevel directory + // and if rename fails delete the path /x/y/z/1 + // If targetPath have multilevel directories like /x/y/z/1/2/3 , /x/y/z/1/2/4 + // the renaming of the directories are not atomic the execution will happen one + // by one + while (actualPath != null && !fs.exists(actualPath)) { + deletePath = actualPath; + actualPath = actualPath.getParent(); + } + fs.mkdirs(mkDirPath); + } + return deletePath; + } + + //it is assumed that parent directory of the destf should already exist when this + //method is called. when the replace value is true, this method works a little different + //from mv command if the destf is a directory, it replaces the destf instead of moving under + //the destf. in this case, the replaced destf still preserves the original destf's permission + private boolean moveFile(HiveConf conf, Path srcf, Path destf, boolean replace, + boolean isSrcLocal, boolean isManaged) throws HiveException { + final FileSystem srcFs, destFs; + try { + destFs = destf.getFileSystem(conf); + } catch (IOException e) { + LOG.error("Failed to get dest fs", e); + throw new HiveException(e.getMessage(), e); + } + try { + srcFs = srcf.getFileSystem(conf); + } catch (IOException e) { + LOG.error("Failed to get src fs", e); + throw new HiveException(e.getMessage(), e); + } + + HdfsUtils.HadoopFileStatus destStatus = null; + String configuredOwner = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_LOAD_DATA_OWNER); + + // If source path is a subdirectory of the destination path (or the other way around): + // ex: INSERT OVERWRITE DIRECTORY 'target/warehouse/dest4.out' SELECT src.value WHERE src.key >= 300; + // where the staging directory is a subdirectory of the destination directory + // (1) Do not delete the dest dir before doing the move operation. + // (2) It is assumed that subdir and dir are in same encryption zone. 
+ // (3) Move individual files from src dir to dest dir. + boolean srcIsSubDirOfDest = Hive.isSubDir(srcf, destf, srcFs, destFs, isSrcLocal), + destIsSubDirOfSrc = Hive.isSubDir(destf, srcf, destFs, srcFs, false); + final String msg = "Unable to move source " + srcf + " to destination " + destf; + try { + if (replace) { + try{ + destStatus = new HdfsUtils.HadoopFileStatus(conf, destFs, destf); + //if destf is an existing directory: + //if replace is true, delete followed by rename(mv) is equivalent to replace + //if replace is false, rename (mv) actually move the src under dest dir + //if destf is an existing file, rename is actually a replace, and do not need + // to delete the file first + if (replace && !srcIsSubDirOfDest) { + destFs.delete(destf, true); + LOG.debug("The path " + destf.toString() + " is deleted"); + } + } catch (FileNotFoundException ignore) { + } + } + final HdfsUtils.HadoopFileStatus desiredStatus = destStatus; + final SessionState parentSession = SessionState.get(); + if (isSrcLocal) { + // For local src file, copy to hdfs + destFs.copyFromLocalFile(srcf, destf); + return true; + } else { + if (needToCopy(srcf, destf, srcFs, destFs, configuredOwner, isManaged)) { + //copy if across file system or encryption zones. + LOG.debug("Copying source " + srcf + " to " + destf + " because HDFS encryption zones are different."); + return FileUtils.copy(srcf.getFileSystem(conf), srcf, destf.getFileSystem(conf), destf, + true, // delete source + replace, // overwrite destination + conf); + } else { + if (srcIsSubDirOfDest || destIsSubDirOfSrc) { + FileStatus[] srcs = destFs.listStatus(srcf, FileUtils.HIDDEN_FILES_PATH_FILTER); + + List> futures = new LinkedList<>(); + final ExecutorService pool = conf.getInt(HiveConf.ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25) > 0 ? 
+ Executors.newFixedThreadPool(conf.getInt(HiveConf.ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25), + new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build()) : null; + if (destIsSubDirOfSrc && !destFs.exists(destf)) { + if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) { + Utilities.FILE_OP_LOGGER.trace("Creating " + destf); + } + destFs.mkdirs(destf); + } + /* Move files one by one because source is a subdirectory of destination */ + for (final FileStatus srcStatus : srcs) { + + final Path destFile = new Path(destf, srcStatus.getPath().getName()); + + final String poolMsg = + "Unable to move source " + srcStatus.getPath() + " to destination " + destFile; + + if (null == pool) { + boolean success = false; + if (destFs instanceof DistributedFileSystem) { + ((DistributedFileSystem)destFs).rename(srcStatus.getPath(), destFile, Options.Rename.OVERWRITE); + success = true; + } else { + destFs.delete(destFile, false); + success = destFs.rename(srcStatus.getPath(), destFile); + } + if(!success) { + throw new IOException("rename for src path: " + srcStatus.getPath() + " to dest:" + + destf + " returned false"); + } + } else { + futures.add(pool.submit(new Callable() { + @Override + public Void call() throws HiveException { + SessionState.setCurrentSessionState(parentSession); + try { + boolean success = false; + if (destFs instanceof DistributedFileSystem) { + ((DistributedFileSystem)destFs).rename(srcStatus.getPath(), destFile, Options.Rename.OVERWRITE); + success = true; + } else { + destFs.delete(destFile, false); + success = destFs.rename(srcStatus.getPath(), destFile); + } + if (!success) { + throw new IOException( + "rename for src path: " + srcStatus.getPath() + " to dest path:" + + destFile + " returned false"); + } + } catch (Exception e) { + throw Hive.getHiveException(e, poolMsg); + } + return null; + } + })); + } + } + if (null != pool) { + pool.shutdown(); + for (Future future : futures) { + try { + future.get(); + } catch 
(Exception e) { + throw handlePoolException(pool, e); + } + } + } + return true; + } else { + if (destFs.rename(srcf, destf)) { + return true; + } + return false; + } + } + } + } catch (Exception e) { + throw Hive.getHiveException(e, msg); + } + } + + /** + * Copy files. This handles building the mapping for buckets and such between the source and + * destination + * @param conf Configuration object + * @param srcf source directory, if bucketed should contain bucket files + * @param destf directory to move files into + * @param fs Filesystem + * @param isSrcLocal true if source is on local file system + * @param isAcidIUD true if this is an ACID based Insert/Update/Delete + * @param isOverwrite if true, then overwrite if destination file exist, else add a duplicate copy + * @param newFiles if this is non-null, a list of files that were created as a result of this + * move will be returned. + * @param isManaged if table is managed. + * @throws HiveException + */ + @Override + public void copyFiles(HiveConf conf, Path srcf, Path destf, FileSystem fs, + boolean isSrcLocal, boolean isAcidIUD, + boolean isOverwrite, List newFiles, boolean isBucketed, + boolean isFullAcidTable, boolean isManaged) throws HiveException { + try { + // create the destination if it does not exist + if (!fs.exists(destf)) { + FileUtils.mkdir(fs, destf, conf); + } + } catch (IOException e) { + throw new HiveException( + "copyFiles: error while checking/creating destination directory!!!", + e); + } + + FileStatus[] srcs; + FileSystem srcFs; + try { + srcFs = srcf.getFileSystem(conf); + srcs = srcFs.globStatus(srcf); + } catch (IOException e) { + LOG.error(StringUtils.stringifyException(e)); + throw new HiveException("addFiles: filesystem error in check phase. " + e.getMessage(), e); + } + if (srcs == null) { + LOG.info("No sources specified to move: " + srcf); + return; + // srcs = new FileStatus[0]; Why is this needed? 
+ } + + // If we're moving files around for an ACID write then the rules and paths are all different. + // You can blame this on Owen. + if (isAcidIUD) { + Hive.moveAcidFiles(srcFs, srcs, destf, newFiles); + } else { + // For ACID non-bucketed case, the filenames have to be in the format consistent with INSERT/UPDATE/DELETE Ops, + // i.e, like 000000_0, 000001_0_copy_1, 000002_0.gz etc. + // The extension is only maintained for files which are compressed. + copyFiles(conf, fs, srcs, srcFs, destf, isSrcLocal, isOverwrite, + newFiles, isFullAcidTable && !isBucketed, isManaged); + } + } + + private void copyFiles(HiveConf conf, FileSystem destFs, FileStatus[] srcs, + FileSystem srcFs, Path destf, boolean isSrcLocal, + boolean isOverwrite, List newFiles, + boolean acidRename, boolean isManaged) throws HiveException { + final HdfsUtils.HadoopFileStatus fullDestStatus; + try { + fullDestStatus = new HdfsUtils.HadoopFileStatus(conf, destFs, destf); + } catch (IOException e1) { + throw new HiveException(e1); + } + + if (!fullDestStatus.getFileStatus().isDirectory()) { + throw new HiveException(destf + " is not a directory."); + } + final List>> futures = new LinkedList<>(); + final ExecutorService pool = conf.getInt(HiveConf.ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25) > 0 ? + Executors.newFixedThreadPool(conf.getInt(HiveConf.ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25), + new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build()) : null; + // For ACID non-bucketed case, the filenames have to be in the format consistent with INSERT/UPDATE/DELETE Ops, + // i.e, like 000000_0, 000001_0_copy_1, 000002_0.gz etc. + // The extension is only maintained for files which are compressed. 
+ int taskId = 0; + // Sort the files + Arrays.sort(srcs); + String configuredOwner = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_LOAD_DATA_OWNER); + for (FileStatus src : srcs) { + FileStatus[] files; + if (src.isDirectory()) { + try { + files = srcFs.listStatus(src.getPath(), FileUtils.HIDDEN_FILES_PATH_FILTER); + } catch (IOException e) { + if (null != pool) { + pool.shutdownNow(); + } + throw new HiveException(e); + } + } else { + files = new FileStatus[] {src}; + } + + final SessionState parentSession = SessionState.get(); + // Sort the files + Arrays.sort(files); + for (final FileStatus srcFile : files) { + final Path srcP = srcFile.getPath(); + final boolean needToCopy = needToCopy(srcP, destf, srcFs, destFs, configuredOwner, isManaged); + + final boolean isRenameAllowed = !needToCopy && !isSrcLocal; + + final String msg = "Unable to move source " + srcP + " to destination " + destf; + + // If we do a rename for a non-local file, we will be transfering the original + // file permissions from source to the destination. Else, in case of mvFile() where we + // copy from source to destination, we will inherit the destination's parent group ownership. + if (null == pool) { + try { + Path destPath = mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed, + acidRename ? taskId++ : -1); + + if (null != newFiles) { + newFiles.add(destPath); + } + } catch (Exception e) { + throw Hive.getHiveException(e, msg, "Failed to move: {}"); + } + } else { + // future only takes final or seemingly final values. Make a final copy of taskId + final int finalTaskId = acidRename ? 
taskId++ : -1; + futures.add(pool.submit(new Callable>() { + @Override + public ObjectPair call() throws HiveException { + SessionState.setCurrentSessionState(parentSession); + + try { + Path destPath = + mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed, finalTaskId); + + if (null != newFiles) { + newFiles.add(destPath); + } + return ObjectPair.create(srcP, destPath); + } catch (Exception e) { + throw Hive.getHiveException(e, msg); + } + } + })); + } + } + } + if (null != pool) { + pool.shutdown(); + for (Future> future : futures) { + try { + ObjectPair pair = future.get(); + LOG.debug("Moved src: {}, to dest: {}", pair.getFirst().toString(), pair.getSecond().toString()); + } catch (Exception e) { + throw handlePoolException(pool, e); + } + } + } + } + + /** + * Replaces files in the partition with new data set specified by srcf. Works + * by renaming directory of srcf to the destination file. + * srcf, destf, and tmppath should resident in the same DFS, but the oldPath can be in a + * different DFS. + * + * @param tablePath path of the table. Used to identify permission inheritance. + * @param srcf + * Source directory to be renamed to tmppath. It should be a + * leaf directory where the final data files reside. However it + * could potentially contain subdirectories as well. + * @param destf + * The directory where the final data needs to go + * @param oldPath + * The directory where the old data location, need to be cleaned up. Most of time, will be the same + * as destf, unless its across FileSystem boundaries. + * @param purge + * When set to true files which needs to be deleted are not moved to Trash + * @param isSrcLocal + * If the source directory is LOCAL + * @param newFiles + * Output the list of new files replaced in the destination path + * @param isManaged + * If the table is managed. 
+ */ + @Override + public void replaceFiles(Path tablePath, Path srcf, Path destf, Path oldPath, HiveConf conf, + boolean isSrcLocal, boolean purge, List newFiles, PathFilter deletePathFilter, + boolean isNeedRecycle, boolean isManaged, Hive hive) throws HiveException { + try { + + FileSystem destFs = destf.getFileSystem(conf); + // check if srcf contains nested sub-directories + FileStatus[] srcs; + FileSystem srcFs; + try { + srcFs = srcf.getFileSystem(conf); + srcs = srcFs.globStatus(srcf); + } catch (IOException e) { + throw new HiveException("Getting globStatus " + srcf.toString(), e); + } + if (srcs == null) { + LOG.info("No sources specified to move: " + srcf); + return; + } + + if (oldPath != null) { + deleteOldPathForReplace(destf, oldPath, conf, purge, deletePathFilter, isNeedRecycle, hive); + } + + // first call FileUtils.mkdir to make sure that destf directory exists, if not, it creates + // destf + boolean destfExist = FileUtils.mkdir(destFs, destf, conf); + if(!destfExist) { + throw new IOException("Directory " + destf.toString() + + " does not exist and could not be created."); + } + + // Two cases: + // 1. srcs has only a src directory, if rename src directory to destf, we also need to + // Copy/move each file under the source directory to avoid to delete the destination + // directory if it is the root of an HDFS encryption zone. + // 2. 
srcs must be a list of files -- ensured by LoadSemanticAnalyzer + // in both cases, we move the file under destf + if (srcs.length == 1 && srcs[0].isDirectory()) { + if (!moveFile(conf, srcs[0].getPath(), destf, true, isSrcLocal, isManaged)) { + throw new IOException("Error moving: " + srcf + " into: " + destf); + } + + // Add file paths of the files that will be moved to the destination if the caller needs it + if (null != newFiles) { + listNewFilesRecursively(destFs, destf, newFiles); + } + } else { + // its either a file or glob + for (FileStatus src : srcs) { + Path destFile = new Path(destf, src.getPath().getName()); + if (!moveFile(conf, src.getPath(), destFile, true, isSrcLocal, isManaged)) { + throw new IOException("Error moving: " + srcf + " into: " + destf); + } + + // Add file paths of the files that will be moved to the destination if the caller needs it + if (null != newFiles) { + newFiles.add(destFile); + } + } + } + } catch (IOException e) { + throw new HiveException(e.getMessage(), e); + } + } + + public void deleteOldPathForReplace(Path destPath, Path oldPath, HiveConf conf, boolean purge, + PathFilter pathFilter, boolean isNeedRecycle, Hive hive) throws HiveException { + Utilities.FILE_OP_LOGGER.debug("Deleting old paths for replace in " + destPath + + " and old path " + oldPath); + boolean isOldPathUnderDestf = false; + try { + FileSystem oldFs = oldPath.getFileSystem(conf); + FileSystem destFs = destPath.getFileSystem(conf); + // if oldPath is destf or its subdir, its should definitely be deleted, otherwise its + // existing content might result in incorrect (extra) data. + // But not sure why we changed not to delete the oldPath in HIVE-8750 if it is + // not the destf or its subdir? 
+ isOldPathUnderDestf = Hive.isSubDir(oldPath, destPath, oldFs, destFs, false); + if (isOldPathUnderDestf) { + cleanUpOneDirectoryForReplace(oldPath, oldFs, pathFilter, conf, purge, isNeedRecycle, hive); + } + } catch (IOException e) { + if (isOldPathUnderDestf) { + // if oldPath is a subdir of destf but it could not be cleaned + throw new HiveException("Directory " + oldPath.toString() + + " could not be cleaned up.", e); + } else { + //swallow the exception since it won't affect the final result + LOG.warn("Directory " + oldPath.toString() + " cannot be cleaned: " + e, e); + } + } + } + + private void cleanUpOneDirectoryForReplace(Path path, FileSystem fs, PathFilter pathFilter, + HiveConf conf, boolean purge, boolean isNeedRecycle, + Hive hive) throws IOException, HiveException { + if (isNeedRecycle && conf.getBoolVar(HiveConf.ConfVars.REPLCMENABLED)) { + recycleDirToCmPath(path, purge, hive); + } + FileStatus[] statuses = fs.listStatus(path, pathFilter); + if (statuses == null || statuses.length == 0) { + return; + } + if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) { + String s = "Deleting files under " + path + " for replace: "; + for (FileStatus file : statuses) { + s += file.getPath().getName() + ", "; + } + Utilities.FILE_OP_LOGGER.trace(s); + } + + if (!Hive.trashFiles(fs, statuses, conf, purge)) { + throw new HiveException("Old path " + path + " has not been cleaned up."); + } + } + + /** + * Recycles the files recursively from the input path to the cmroot directory either by copying or moving it. 
+ * + * @param dataPath Path of the data files to be recycled to cmroot + * @param isPurge + * When set to true files which needs to be recycled are not moved to Trash + */ + public void recycleDirToCmPath(Path dataPath, boolean isPurge, Hive hive) throws HiveException { + try { + CmRecycleRequest request = new CmRecycleRequest(dataPath.toString(), isPurge); + hive.getMSC().recycleDirToCmPath(request); + } catch (Exception e) { + throw new HiveException(e); + } + } + + /** + *

+ * Moves a file from one {@link Path} to another. If {@code isRenameAllowed} is true then the + * {@link FileSystem#rename(Path, Path)} method is used to move the file. If its false then the data is copied, if + * {@code isSrcLocal} is true then the {@link FileSystem#copyFromLocalFile(Path, Path)} method is used, else + * {@link FileUtils#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, HiveConf)} is used. + *

+ * + *

+ * If the destination file already exists, then {@code _copy_[counter]} is appended to the file name, where counter + * is an integer starting from 1. + *

+ * + * @param conf the {@link HiveConf} to use if copying data + * @param sourceFs the {@link FileSystem} where the source file exists + * @param sourcePath the {@link Path} to move + * @param destFs the {@link FileSystem} to move the file to + * @param destDirPath the {@link Path} to move the file to + * @param isSrcLocal if the source file is on the local filesystem + * @param isOverwrite if true, then overwrite destination file if exist else make a duplicate copy + * @param isRenameAllowed true if the data should be renamed and not copied, false otherwise + * + * @return the {@link Path} the source file was moved to + * + * @throws IOException if there was an issue moving the file + */ + private static Path mvFile(HiveConf conf, FileSystem sourceFs, Path sourcePath, FileSystem destFs, + Path destDirPath, boolean isSrcLocal, boolean isOverwrite, + boolean isRenameAllowed, int taskId) throws IOException { + + // Strip off the file type, if any so we don't make: + // 000000_0.gz -> 000000_0.gz_copy_1 + final String fullname = sourcePath.getName(); + final String name; + if (taskId == -1) { // non-acid + name = FilenameUtils.getBaseName(sourcePath.getName()); + } else { // acid + name = getPathName(taskId); + } + final String type = FilenameUtils.getExtension(sourcePath.getName()); + + // Incase of ACID, the file is ORC so the extension is not relevant and should not be inherited. + Path destFilePath = new Path(destDirPath, taskId == -1 ? fullname : name); + + /* + * The below loop may perform bad when the destination file already exists and it has too many _copy_ + * files as well. A desired approach was to call listFiles() and get a complete list of files from + * the destination, and check whether the file exists or not on that list. However, millions of files + * could live on the destination directory, and on concurrent situations, this can cause OOM problems. + * + * I'll leave the below loop for now until a better approach is found. 
+ */ + for (int counter = 1; destFs.exists(destFilePath); counter++) { + if (isOverwrite) { + destFs.delete(destFilePath, false); + break; + } + destFilePath = new Path(destDirPath, name + (Utilities.COPY_KEYWORD + counter) + + ((taskId == -1 && !type.isEmpty()) ? "." + type : "")); + } + + if (isRenameAllowed) { + destFs.rename(sourcePath, destFilePath); + } else if (isSrcLocal) { + destFs.copyFromLocalFile(sourcePath, destFilePath); + } else { + FileUtils.copy(sourceFs, sourcePath, destFs, destFilePath, + true, // delete source + false, // overwrite destination + conf); + } + return destFilePath; + } + + /** + * If moving across different FileSystems or differnent encryption zone, need to do a File copy instead of rename. + * TODO- consider if need to do this for different file authority. + * @throws HiveException + */ + static private boolean needToCopy(Path srcf, Path destf, FileSystem srcFs, + FileSystem destFs, String configuredOwner, boolean isManaged) throws HiveException { + //Check if different FileSystems + if (!FileUtils.equalsFileSystem(srcFs, destFs)) { + return true; + } + + if (isManaged && !configuredOwner.isEmpty() && srcFs instanceof DistributedFileSystem) { + // Need some extra checks + // Get the running owner + FileStatus srcs; + + try { + srcs = srcFs.getFileStatus(srcf); + String runningUser = UserGroupInformation.getLoginUser().getShortUserName(); + boolean isOwned = FileUtils.isOwnerOfFileHierarchy(srcFs, srcs, configuredOwner, false); + if (configuredOwner.equals(runningUser)) { + // Check if owner has write permission, else it will have to copy + if (!(isOwned && + FileUtils.isActionPermittedForFileHierarchy( + srcFs, srcs, configuredOwner, FsAction.WRITE, false))) { + return true; + } + } else { + // If the configured owner does not own the file, throw + if (!isOwned) { + throw new HiveException("Load Data failed for " + srcf + " as the file is not owned by " + + configuredOwner + " and load data is also not ran as " + configuredOwner); 
+ } else { + return true; + } + } + } catch (IOException e) { + throw new HiveException("Could not fetch FileStatus for source file"); + } catch (HiveException e) { + throw new HiveException(e); + } catch (Exception e) { + throw new HiveException(" Failed in looking up Permissions on file + " + srcf); + } + } + + //Check if different encryption zones + HadoopShims.HdfsEncryptionShim srcHdfsEncryptionShim = SessionState.get().getHdfsEncryptionShim(srcFs); + HadoopShims.HdfsEncryptionShim destHdfsEncryptionShim = SessionState.get().getHdfsEncryptionShim(destFs); + try { + return srcHdfsEncryptionShim != null + && destHdfsEncryptionShim != null + && (srcHdfsEncryptionShim.isPathEncrypted(srcf) || destHdfsEncryptionShim.isPathEncrypted(destf)) + && !srcHdfsEncryptionShim.arePathsOnSameEncryptionZone(srcf, destf, destHdfsEncryptionShim); + } catch (IOException e) { + throw new HiveException(e); + } + } + + static private HiveException handlePoolException(ExecutorService pool, Exception e) { + HiveException he = null; + + if (e instanceof HiveException) { + he = (HiveException) e; + if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) { + if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) { + LOG.error("Failed to move: {}", he.getMessage()); + } else { + LOG.error("Failed to move: {}", he.getRemoteErrorMsg()); + } + } + } else { + LOG.error("Failed to move: {}", e.getMessage()); + he = new HiveException(e.getCause()); + } + pool.shutdownNow(); + return he; + } + + // List the new files in destination path which gets copied from source. + private static void listNewFilesRecursively(final FileSystem destFs, Path dest, + List newFiles) throws HiveException { + try { + for (FileStatus fileStatus : destFs.listStatus(dest, FileUtils.HIDDEN_FILES_PATH_FILTER)) { + if (fileStatus.isDirectory()) { + // If it is a sub-directory, then recursively list the files. 
+ listNewFilesRecursively(destFs, fileStatus.getPath(), newFiles); + } else { + newFiles.add(fileStatus.getPath()); + } + } + } catch (IOException e) { + LOG.error("Failed to get source file statuses", e); + throw new HiveException(e.getMessage(), e); + } + } + + private static String getPathName(int taskId) { + return Utilities.replaceTaskId("000000", taskId) + "_0"; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java index 09cbf32f9c98..683db2a25080 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapredContext.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.common.classification.InterfaceAudience; import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.mapreduce.TaskAttemptID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.conf.HiveConf; @@ -73,6 +74,7 @@ public static void close() { private final List udfs; private Reporter reporter; + private TaskAttemptID taskAttemptID; protected MapredContext(boolean isMap, JobConf jobConf) { this.isMap = isMap; @@ -105,6 +107,14 @@ public void setReporter(Reporter reporter) { this.reporter = reporter; } + public TaskAttemptID getTaskAttemptID() { + return this.taskAttemptID; + } + + public void setTaskAttemptID(TaskAttemptID taskAttemptID) { + this.taskAttemptID = taskAttemptID; + } + private void registerCloseable(Closeable closeable) { udfs.add(closeable); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java index 2bb3ec4a5a59..89a3165ec6e4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java @@ -85,123 +85,6 @@ public MoveTask() { super(); } - private void moveFile(Path sourcePath, Path targetPath, boolean 
isDfsDir) - throws HiveException { - try { - String mesg = "Moving data to " + (isDfsDir ? "" : "local ") + "directory " - + targetPath.toString(); - String mesg_detail = " from " + sourcePath.toString(); - console.printInfo(mesg, mesg_detail); - - FileSystem fs = sourcePath.getFileSystem(conf); - if (isDfsDir) { - moveFileInDfs (sourcePath, targetPath, conf); - } else { - // This is a local file - FileSystem dstFs = FileSystem.getLocal(conf); - moveFileFromDfsToLocal(sourcePath, targetPath, fs, dstFs); - } - } catch (Exception e) { - throw new HiveException("Unable to move source " + sourcePath + " to destination " - + targetPath, e); - } - } - - private void moveFileInDfs (Path sourcePath, Path targetPath, HiveConf conf) - throws HiveException, IOException { - - final FileSystem srcFs, tgtFs; - try { - tgtFs = targetPath.getFileSystem(conf); - } catch (IOException e) { - LOG.error("Failed to get dest fs", e); - throw new HiveException(e.getMessage(), e); - } - try { - srcFs = sourcePath.getFileSystem(conf); - } catch (IOException e) { - LOG.error("Failed to get src fs", e); - throw new HiveException(e.getMessage(), e); - } - - // if source exists, rename. Otherwise, create a empty directory - if (srcFs.exists(sourcePath)) { - Path deletePath = null; - // If it multiple level of folder are there fs.rename is failing so first - // create the targetpath.getParent() if it not exist - if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_INSERT_INTO_MULTILEVEL_DIRS)) { - deletePath = createTargetPath(targetPath, tgtFs); - } - Hive.clearDestForSubDirSrc(conf, targetPath, sourcePath, false); - // Set isManaged to false as this is not load data operation for which it is needed. 
- if (!Hive.moveFile(conf, sourcePath, targetPath, true, false, false)) { - try { - if (deletePath != null) { - tgtFs.delete(deletePath, true); - } - } catch (IOException e) { - LOG.info("Unable to delete the path created for facilitating rename: {}", - deletePath); - } - throw new HiveException("Unable to rename: " + sourcePath - + " to: " + targetPath); - } - } else if (!tgtFs.mkdirs(targetPath)) { - throw new HiveException("Unable to make directory: " + targetPath); - } - } - - private void moveFileFromDfsToLocal(Path sourcePath, Path targetPath, FileSystem fs, - FileSystem dstFs) throws HiveException, IOException { - // RawLocalFileSystem seems not able to get the right permissions for a local file, it - // always returns hdfs default permission (00666). So we can not overwrite a directory - // by deleting and recreating the directory and restoring its permissions. We should - // delete all its files and subdirectories instead. - if (dstFs.exists(targetPath)) { - if (dstFs.isDirectory(targetPath)) { - FileStatus[] destFiles = dstFs.listStatus(targetPath); - for (FileStatus destFile : destFiles) { - if (!dstFs.delete(destFile.getPath(), true)) { - throw new IOException("Unable to clean the destination directory: " + targetPath); - } - } - } else { - throw new HiveException("Target " + targetPath + " is not a local directory."); - } - } else { - if (!FileUtils.mkdir(dstFs, targetPath, conf)) { - throw new HiveException("Failed to create local target directory " + targetPath); - } - } - - if (fs.exists(sourcePath)) { - FileStatus[] srcs = fs.listStatus(sourcePath, FileUtils.HIDDEN_FILES_PATH_FILTER); - for (FileStatus status : srcs) { - fs.copyToLocalFile(status.getPath(), targetPath); - } - } - } - - private Path createTargetPath(Path targetPath, FileSystem fs) throws IOException { - Path deletePath = null; - Path mkDirPath = targetPath.getParent(); - if (mkDirPath != null && !fs.exists(mkDirPath)) { - Path actualPath = mkDirPath; - // targetPath path is 
/x/y/z/1/2/3 here /x/y/z is present in the file system - // create the structure till /x/y/z/1/2 to work rename for multilevel directory - // and if rename fails delete the path /x/y/z/1 - // If targetPath have multilevel directories like /x/y/z/1/2/3 , /x/y/z/1/2/4 - // the renaming of the directories are not atomic the execution will happen one - // by one - while (actualPath != null && !fs.exists(actualPath)) { - deletePath = actualPath; - actualPath = actualPath.getParent(); - } - fs.mkdirs(mkDirPath); - } - return deletePath; - } - // Release all the locks acquired for this object // This becomes important for multi-table inserts when one branch may take much more // time than the others. It is better to release the lock for this particular insert. @@ -283,6 +166,15 @@ public int execute(DriverContext driverContext) { } Hive db = getHive(); + + DataCommitter dataCommitter; + if (work.getPathOutputCommitterWork() != null) { + dataCommitter = new PathOutputCommitterDataCommitter(work.getPathOutputCommitterWork() + .getJobContext(), work.getPathOutputCommitterWork().createPathOutputCommitter()); + } else { + dataCommitter = new HiveDataCommitter(work); + } + // Do any hive related operations like moving tables and files // to appropriate locations LoadFileDesc lfd = work.getLoadFileWork(); @@ -307,7 +199,7 @@ public int execute(DriverContext driverContext) { } } else { - moveFile(sourcePath, targetPath, lfd.getIsDfsDir()); + dataCommitter.moveFile(sourcePath, targetPath, lfd.getIsDfsDir(), conf, console); } } } @@ -328,7 +220,7 @@ public int execute(DriverContext driverContext) { destFs.mkdirs(destPath.getParent()); } Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + srcPath + " to " + destPath); - moveFile(srcPath, destPath, isDfsDir); + dataCommitter.moveFile(srcPath, destPath, isDfsDir, conf, console); } else { if (!destFs.exists(destPath)) { destFs.mkdirs(destPath); @@ -340,7 +232,7 @@ public int execute(DriverContext driverContext) { Path 
childSrc = child.getPath(); Path childDest = new Path(destPath, filePrefix + childSrc.getName()); Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + childSrc + " to " + childDest); - moveFile(childSrc, childDest, isDfsDir); + dataCommitter.moveFile(childSrc, childDest, isDfsDir, conf, console); } } else { Utilities.FILE_OP_LOGGER.debug("MoveTask skipping empty directory (multi-file) " + srcPath); @@ -373,7 +265,7 @@ public int execute(DriverContext driverContext) { } db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getLoadFileType(), work.isSrcLocal(), isSkewedStoredAsDirs(tbd), isFullAcidOp, hasFollowingStatsTask(), - tbd.getWriteId(), tbd.getStmtId(), tbd.isInsertOverwrite()); + tbd.getWriteId(), tbd.getStmtId(), tbd.isInsertOverwrite(), dataCommitter); if (work.getOutputs() != null) { DDLTask.addIfAbsentByName(new WriteEntity(table, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs()); @@ -387,9 +279,9 @@ public int execute(DriverContext driverContext) { // deal with dynamic partitions DynamicPartitionCtx dpCtx = tbd.getDPCtx(); if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions - dc = handleDynParts(db, table, tbd, ti, dpCtx); + dc = handleDynParts(db, table, tbd, ti, dpCtx, dataCommitter); } else { // static partitions - dc = handleStaticParts(db, table, tbd, ti); + dc = handleStaticParts(db, table, tbd, ti, dataCommitter); } } if (dc != null) { @@ -461,7 +353,8 @@ public void logMessage(LoadTableDesc tbd) { } private DataContainer handleStaticParts(Hive db, Table table, LoadTableDesc tbd, - TaskInformation ti) throws HiveException, IOException, InvalidOperationException { + TaskInformation ti, DataCommitter dataCommitter) throws HiveException, IOException, + InvalidOperationException { List partVals = MetaStoreUtils.getPvals(table.getPartCols(), tbd.getPartitionSpec()); db.validatePartitionNameCharacters(partVals); if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) { @@ -475,7 +368,7 
@@ private DataContainer handleStaticParts(Hive db, Table table, LoadTableDesc tbd, work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID && !tbd.isMmTable(), hasFollowingStatsTask(), - tbd.getWriteId(), tbd.getStmtId(), tbd.isInsertOverwrite()); + tbd.getWriteId(), tbd.getStmtId(), tbd.isInsertOverwrite(), dataCommitter); Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false); // See the comment inside updatePartitionBucketSortColumns. @@ -494,7 +387,7 @@ private DataContainer handleStaticParts(Hive db, Table table, LoadTableDesc tbd, } private DataContainer handleDynParts(Hive db, Table table, LoadTableDesc tbd, - TaskInformation ti, DynamicPartitionCtx dpCtx) throws HiveException, + TaskInformation ti, DynamicPartitionCtx dpCtx, DataCommitter dataCommitter) throws HiveException, IOException, InvalidOperationException { DataContainer dc; List> dps = Utilities.getFullDPSpecs(conf, dpCtx); @@ -522,7 +415,8 @@ private DataContainer handleDynParts(Hive db, Table table, LoadTableDesc tbd, tbd.getStmtId(), hasFollowingStatsTask(), work.getLoadTableWork().getWriteType(), - tbd.isInsertOverwrite()); + tbd.isInsertOverwrite(), + dataCommitter); // publish DP columns to its subscribers if (dps != null && dps.size() > 0) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/PathOutputCommitterDataCommitter.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/PathOutputCommitterDataCommitter.java new file mode 100644 index 000000000000..fa424eade8df --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/PathOutputCommitterDataCommitter.java @@ -0,0 +1,59 @@ +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.session.SessionState; +import 
org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter; + +import java.io.IOException; +import java.util.List; + +/** + * A {@link DataCommitter} that commits Hive data using a {@link PathOutputCommitter}. + */ +class PathOutputCommitterDataCommitter implements DataCommitter { + + private final JobContext jobContext; + private final PathOutputCommitter pathOutputCommitter; + + PathOutputCommitterDataCommitter(JobContext jobContext, + PathOutputCommitter pathOutputCommitter) { + this.jobContext = jobContext; + this.pathOutputCommitter = pathOutputCommitter; + } + + @Override + public void moveFile(Path sourcePath, Path targetPath, boolean isDfsDir, HiveConf conf, + SessionState.LogHelper console) throws HiveException { + commitJob(); + } + + @Override + public void copyFiles(HiveConf conf, Path srcf, Path destf, FileSystem fs, boolean isSrcLocal, + boolean isAcidIUD, boolean isOverwrite, List newFiles, + boolean isBucketed, boolean isFullAcidTable, + boolean isManaged) throws HiveException { + commitJob(); + } + + @Override + public void replaceFiles(Path tablePath, Path srcf, Path destf, Path oldPath, HiveConf conf, + boolean isSrcLocal, boolean purge, List newFiles, + PathFilter deletePathFilter, boolean isNeedRecycle, boolean isManaged, + Hive hive) throws HiveException { + commitJob(); + } + + private void commitJob() throws HiveException { + try { + this.pathOutputCommitter.commitJob(this.jobContext); + } catch (IOException e) { + throw new HiveException(e); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/PathOutputCommitterSetupTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/PathOutputCommitterSetupTask.java new file mode 100644 index 000000000000..dc1c62bf3dff --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/PathOutputCommitterSetupTask.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.hive.ql.DriverContext; +import org.apache.hadoop.hive.ql.plan.PathOutputCommitterWork; +import org.apache.hadoop.hive.ql.plan.api.StageType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class PathOutputCommitterSetupTask extends Task { + + private static final Logger LOG = LoggerFactory.getLogger(PathOutputCommitterSetupTask.class); + + private static final long serialVersionUID = -8867710739987754989L; + + @Override + protected int execute(DriverContext driverContext) { + try { + LOG.info("Running setupJob for Path Output Committer " + + work.getPathOutputCommitterClass().getName()); + work.createPathOutputCommitter().setupJob(getWork().getJobContext()); + } catch (Exception e) { + LOG.error("Failed run setupJob for Path Output Committer " + + work.getPathOutputCommitterClass().getName(), e); + setException(e); + return 1; + } + return 0; + } + + @Override + public StageType getType() { + return null; + } + + @Override + public String getName() { + return null; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java index 3a107b7e8128..51562f3c4240 100644 --- 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java @@ -36,6 +36,7 @@ import org.apache.hadoop.hive.ql.io.merge.MergeFileTask; import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.plan.ColumnStatsUpdateWork; +import org.apache.hadoop.hive.ql.plan.PathOutputCommitterWork; import org.apache.hadoop.hive.ql.plan.StatsWork; import org.apache.hadoop.hive.ql.plan.ConditionalWork; import org.apache.hadoop.hive.ql.plan.CopyWork; @@ -113,6 +114,7 @@ public TaskTuple(Class workClass, Class> taskClass) { taskvec.add(new TaskTuple<>(ReplStateLogWork.class, ReplStateLogTask.class)); taskvec.add(new TaskTuple(ExportWork.class, ExportTask.class)); taskvec.add(new TaskTuple(ReplTxnWork.class, ReplTxnTask.class)); + taskvec.add(new TaskTuple<>(PathOutputCommitterWork.class, PathOutputCommitterSetupTask.class)); } private static ThreadLocal tid = new ThreadLocal() { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java index d0038cd3d87b..d24a5357109e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; +import org.apache.hadoop.mapred.TaskAttemptID; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.CompilationOpContext; @@ -73,6 +74,8 @@ public class ExecMapper extends MapReduceBase implements Mapper { @Override public void configure(JobConf job) { + TaskAttemptID taskAttemptID = TaskAttemptID.forName(job.get("mapred.task.id")); + execContext = new ExecMapperContext(job); Utilities.tryLoggingClassPaths(job, l4j); setDone(false); @@ -100,6 +103,7 @@ public void configure(JobConf job) { execContext.setLocalWork(localWork); MapredContext.init(true, new JobConf(jc)); + 
MapredContext.get().setTaskAttemptID(taskAttemptID); mo.passExecContext(execContext); mo.initializeLocalWork(jc); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java index 7ce1544839f1..5169c6bfa9ca 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecReducer.java @@ -23,6 +23,7 @@ import java.util.Iterator; import java.util.List; +import org.apache.hadoop.mapred.TaskAttemptID; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -86,6 +87,8 @@ public class ExecReducer extends MapReduceBase implements Reducer { @Override public void configure(JobConf job) { + TaskAttemptID taskAttemptID = TaskAttemptID.forName(job.get("mapred.task.id")); + rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector keyObjectInspector; @@ -127,6 +130,7 @@ public void configure(JobConf job) { } MapredContext.init(false, new JobConf(jc)); + MapredContext.get().setTaskAttemptID(taskAttemptID); // initialize reduce operator tree try { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java index 7cd853f87810..8a85a074422f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkMapRecordHandler.java @@ -22,8 +22,6 @@ import java.util.Iterator; import java.util.List; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.AbstractMapOperator; import org.apache.hadoop.hive.ql.exec.MapOperator; @@ -42,6 +40,10 @@ import 
org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.TaskAttemptID; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -65,6 +67,8 @@ public void init(JobConf job, OutputCollector output, Reporter repo perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS); super.init(job, output, reporter); + TaskAttemptID taskAttemptID = TaskAttemptID.forName(job.get("mapred.task.id")); + try { jc = job; execContext = new ExecMapperContext(jc); @@ -88,6 +92,7 @@ public void init(JobConf job, OutputCollector output, Reporter repo execContext.setLocalWork(localWork); MapredContext.init(true, new JobConf(jc)); + MapredContext.get().setTaskAttemptID(taskAttemptID); MapredContext.get().setReporter(reporter); mo.passExecContext(execContext); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java index 6a7e1dfa59eb..7a129c84334b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkReduceRecordHandler.java @@ -24,8 +24,6 @@ import java.util.Iterator; import java.util.List; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; @@ -57,11 +55,16 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.google.common.base.Preconditions; + /** * Clone from ExecReducer, it is the bridge between the spark 
framework and * the Hive operator pipeline at execution time. It's main responsibilities are: @@ -122,6 +125,8 @@ public void init(JobConf job, OutputCollector output, Reporter reporter) throws perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS); super.init(job, output, reporter); + TaskAttemptID taskAttemptID = TaskAttemptID.forName(job.get("mapred.task.id")); + rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE]; ObjectInspector keyObjectInspector; @@ -218,6 +223,8 @@ public void init(JobConf job, OutputCollector output, Reporter reporter) throws Utilities.reduceFieldNameList, ois); } } + MapredContext.init(false, new JobConf(jc)); + MapredContext.get().setTaskAttemptID(taskAttemptID); } catch (Exception e) { throw new RuntimeException(e); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 4d2e1a4e9adf..e9d877df1ceb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -164,9 +164,11 @@ import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator; +import org.apache.hadoop.hive.ql.exec.DataCommitter; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.FunctionTask; import org.apache.hadoop.hive.ql.exec.FunctionUtils; +import org.apache.hadoop.hive.ql.exec.HiveDataCommitter; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.AcidUtils; @@ -195,6 +197,7 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.StringUtils; import org.apache.hive.common.util.TxnIdUtils; +import org.apache.hadoop.util.ReflectionUtils; import 
org.apache.thrift.TException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -438,6 +441,7 @@ public static void closeCurrent() { */ private Hive(HiveConf c, boolean doRegisterAllFns) throws HiveException { conf = c; + if (doRegisterAllFns) { registerAllFunctionsOnce(); } @@ -1725,7 +1729,7 @@ public Database getDatabaseCurrent() throws HiveException { public Partition loadPartition(Path loadPath, Table tbl, Map partSpec, LoadFileType loadFileType, boolean inheritTableSpecs, boolean isSkewedStoreAsSubdir, boolean isSrcLocal, boolean isAcidIUDoperation, boolean hasFollowingStatsTask, Long writeId, - int stmtId, boolean isInsertOverwrite) throws HiveException { + int stmtId, boolean isInsertOverwrite, DataCommitter dataCommitter) throws HiveException { Path tblDataLocationPath = tbl.getDataLocation(); boolean isMmTableWrite = AcidUtils.isInsertOnlyTable(tbl.getParameters()); assert tbl.getPath() != null : "null==getPath() for " + tbl.getTableName(); @@ -1814,11 +1818,12 @@ public Partition loadPartition(Path loadPath, Table tbl, Map par boolean isAutoPurge = "true".equalsIgnoreCase(tbl.getProperty("auto.purge")); boolean needRecycle = !tbl.isTemporary() && ReplChangeManager.isSourceOfReplication(Hive.get().getDatabase(tbl.getDbName())); - replaceFiles(tbl.getPath(), loadPath, destPath, oldPartPath, getConf(), isSrcLocal, - isAutoPurge, newFiles, FileUtils.HIDDEN_FILES_PATH_FILTER, needRecycle, isManaged); + dataCommitter.replaceFiles(tbl.getPath(), loadPath, destPath, oldPartPath, getConf(), isSrcLocal, + isAutoPurge, newFiles, FileUtils.HIDDEN_FILES_PATH_FILTER, needRecycle, isManaged, + this); } else { FileSystem fs = tbl.getDataLocation().getFileSystem(conf); - copyFiles(conf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation, + dataCommitter.copyFiles(conf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation, (loadFileType == LoadFileType.OVERWRITE_EXISTING), newFiles, tbl.getNumBuckets() > 0, isFullAcidTable, isManaged); } @@ -2197,7 +2202,7 
@@ public Map, Partition> loadDynamicPartitions(final Path load final String tableName, final Map partSpec, final LoadFileType loadFileType, final int numDP, final int numLB, final boolean isAcid, final long writeId, final int stmtId, final boolean hasFollowingStatsTask, final AcidUtils.Operation operation, - boolean isInsertOverwrite) throws HiveException { + boolean isInsertOverwrite, DataCommitter dataCommitter) throws HiveException { final Map, Partition> partitionsMap = Collections.synchronizedMap(new LinkedHashMap, Partition>()); @@ -2245,8 +2250,8 @@ public Void call() throws Exception { // load the partition Partition newPartition = loadPartition(partPath, tbl, fullPartSpec, loadFileType, - true, numLB > 0, false, isAcid, hasFollowingStatsTask, writeId, stmtId, - isInsertOverwrite); + true, numLB > 0, false, isAcid, hasFollowingStatsTask, writeId, stmtId, + isInsertOverwrite, dataCommitter); partitionsMap.put(fullPartSpec, newPartition); if (inPlaceEligible) { @@ -2338,9 +2343,11 @@ public Void call() throws Exception { * @param writeId write ID allocated for the current load operation * @param stmtId statement ID of the current load statement */ - public void loadTable(Path loadPath, String tableName, LoadFileType loadFileType, boolean isSrcLocal, - boolean isSkewedStoreAsSubdir, boolean isAcidIUDoperation, boolean hasFollowingStatsTask, - Long writeId, int stmtId, boolean isInsertOverwrite) throws HiveException { + public void loadTable(Path loadPath, String tableName, LoadFileType loadFileType, + boolean isSrcLocal, boolean isSkewedStoreAsSubdir, + boolean isAcidIUDoperation, boolean hasFollowingStatsTask, Long writeId, + int stmtId, boolean isInsertOverwrite, + DataCommitter committer) throws HiveException { List newFiles = Collections.synchronizedList(new ArrayList()); Table tbl = getTable(tableName); @@ -2384,12 +2391,12 @@ public void loadTable(Path loadPath, String tableName, LoadFileType loadFileType boolean isAutopurge =
"true".equalsIgnoreCase(tbl.getProperty("auto.purge")); boolean needRecycle = !tbl.isTemporary() && ReplChangeManager.isSourceOfReplication(Hive.get().getDatabase(tbl.getDbName())); - replaceFiles(tblPath, loadPath, destPath, tblPath, conf, isSrcLocal, isAutopurge, - newFiles, FileUtils.HIDDEN_FILES_PATH_FILTER, needRecycle, isManaged); + committer.replaceFiles(tblPath, loadPath, destPath, tblPath, conf, isSrcLocal, isAutopurge, + newFiles, FileUtils.HIDDEN_FILES_PATH_FILTER, needRecycle, isManaged, this); } else { try { FileSystem fs = tbl.getDataLocation().getFileSystem(conf); - copyFiles(conf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation, + committer.copyFiles(conf, loadPath, destPath, fs, isSrcLocal, isAcidIUDoperation, loadFileType == LoadFileType.OVERWRITE_EXISTING, newFiles, tbl.getNumBuckets() > 0, isFullAcidTable, isManaged); } catch (IOException e) { @@ -3342,108 +3349,8 @@ public List showPrivilegeGrant( } } - private static void copyFiles(final HiveConf conf, final FileSystem destFs, - FileStatus[] srcs, final FileSystem srcFs, final Path destf, - final boolean isSrcLocal, boolean isOverwrite, - final List newFiles, boolean acidRename, boolean isManaged) throws HiveException { - - final HdfsUtils.HadoopFileStatus fullDestStatus; - try { - fullDestStatus = new HdfsUtils.HadoopFileStatus(conf, destFs, destf); - } catch (IOException e1) { - throw new HiveException(e1); - } - - if (!fullDestStatus.getFileStatus().isDirectory()) { - throw new HiveException(destf + " is not a directory."); - } - final List>> futures = new LinkedList<>(); - final ExecutorService pool = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25) > 0 ? 
- Executors.newFixedThreadPool(conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 25), - new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Move-Thread-%d").build()) : null; - // For ACID non-bucketed case, the filenames have to be in the format consistent with INSERT/UPDATE/DELETE Ops, - // i.e, like 000000_0, 000001_0_copy_1, 000002_0.gz etc. - // The extension is only maintained for files which are compressed. - int taskId = 0; - // Sort the files - Arrays.sort(srcs); - String configuredOwner = HiveConf.getVar(conf, ConfVars.HIVE_LOAD_DATA_OWNER); - for (FileStatus src : srcs) { - FileStatus[] files; - if (src.isDirectory()) { - try { - files = srcFs.listStatus(src.getPath(), FileUtils.HIDDEN_FILES_PATH_FILTER); - } catch (IOException e) { - pool.shutdownNow(); - throw new HiveException(e); - } - } else { - files = new FileStatus[] {src}; - } - - final SessionState parentSession = SessionState.get(); - // Sort the files - Arrays.sort(files); - for (final FileStatus srcFile : files) { - final Path srcP = srcFile.getPath(); - final boolean needToCopy = needToCopy(srcP, destf, srcFs, destFs, configuredOwner, isManaged); - - final boolean isRenameAllowed = !needToCopy && !isSrcLocal; - - final String msg = "Unable to move source " + srcP + " to destination " + destf; - - // If we do a rename for a non-local file, we will be transfering the original - // file permissions from source to the destination. Else, in case of mvFile() where we - // copy from source to destination, we will inherit the destination's parent group ownership. - if (null == pool) { - try { - Path destPath = mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed, - acidRename ? taskId++ : -1); - - if (null != newFiles) { - newFiles.add(destPath); - } - } catch (Exception e) { - throw getHiveException(e, msg, "Failed to move: {}"); - } - } else { - // future only takes final or seemingly final values. 
Make a final copy of taskId - final int finalTaskId = acidRename ? taskId++ : -1; - futures.add(pool.submit(new Callable>() { - @Override - public ObjectPair call() throws HiveException { - SessionState.setCurrentSessionState(parentSession); - - try { - Path destPath = - mvFile(conf, srcFs, srcP, destFs, destf, isSrcLocal, isOverwrite, isRenameAllowed, finalTaskId); - - if (null != newFiles) { - newFiles.add(destPath); - } - return ObjectPair.create(srcP, destPath); - } catch (Exception e) { - throw getHiveException(e, msg); - } - } - })); - } - } - } - if (null != pool) { - pool.shutdown(); - for (Future> future : futures) { - try { - ObjectPair pair = future.get(); - LOG.debug("Moved src: {}, to dest: {}", pair.getFirst().toString(), pair.getSecond().toString()); - } catch (Exception e) { - throw handlePoolException(pool, e); - } - } - } - } - - private static boolean isSubDir(Path srcf, Path destf, FileSystem srcFs, FileSystem destFs, boolean isSrcLocal) { + public static boolean isSubDir(Path srcf, Path destf, FileSystem srcFs, FileSystem destFs, + boolean isSrcLocal) { if (srcf == null) { LOG.debug("The source path is null for isSubDir method."); return false; @@ -3486,84 +3393,6 @@ private static Path getQualifiedPathWithoutSchemeAndAuthority(Path srcf, FileSys return ShimLoader.getHadoopShims().getPathWithoutSchemeAndAuthority(path); } - private static String getPathName(int taskId) { - return Utilities.replaceTaskId("000000", taskId) + "_0"; - } - - /** - *

- * Moves a file from one {@link Path} to another. If {@code isRenameAllowed} is true then the - * {@link FileSystem#rename(Path, Path)} method is used to move the file. If its false then the data is copied, if - * {@code isSrcLocal} is true then the {@link FileSystem#copyFromLocalFile(Path, Path)} method is used, else - * {@link FileUtils#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, HiveConf)} is used. - *

- * - *

- * If the destination file already exists, then {@code _copy_[counter]} is appended to the file name, where counter - * is an integer starting from 1. - *

- * - * @param conf the {@link HiveConf} to use if copying data - * @param sourceFs the {@link FileSystem} where the source file exists - * @param sourcePath the {@link Path} to move - * @param destFs the {@link FileSystem} to move the file to - * @param destDirPath the {@link Path} to move the file to - * @param isSrcLocal if the source file is on the local filesystem - * @param isOverwrite if true, then overwrite destination file if exist else make a duplicate copy - * @param isRenameAllowed true if the data should be renamed and not copied, false otherwise - * - * @return the {@link Path} the source file was moved to - * - * @throws IOException if there was an issue moving the file - */ - private static Path mvFile(HiveConf conf, FileSystem sourceFs, Path sourcePath, FileSystem destFs, Path destDirPath, - boolean isSrcLocal, boolean isOverwrite, boolean isRenameAllowed, - int taskId) throws IOException { - - // Strip off the file type, if any so we don't make: - // 000000_0.gz -> 000000_0.gz_copy_1 - final String fullname = sourcePath.getName(); - final String name; - if (taskId == -1) { // non-acid - name = FilenameUtils.getBaseName(sourcePath.getName()); - } else { // acid - name = getPathName(taskId); - } - final String type = FilenameUtils.getExtension(sourcePath.getName()); - - // Incase of ACID, the file is ORC so the extension is not relevant and should not be inherited. - Path destFilePath = new Path(destDirPath, taskId == -1 ? fullname : name); - - /* - * The below loop may perform bad when the destination file already exists and it has too many _copy_ - * files as well. A desired approach was to call listFiles() and get a complete list of files from - * the destination, and check whether the file exists or not on that list. However, millions of files - * could live on the destination directory, and on concurrent situations, this can cause OOM problems. - * - * I'll leave the below loop for now until a better approach is found. 
- */ - for (int counter = 1; destFs.exists(destFilePath); counter++) { - if (isOverwrite) { - destFs.delete(destFilePath, false); - break; - } - destFilePath = new Path(destDirPath, name + (Utilities.COPY_KEYWORD + counter) + - ((taskId == -1 && !type.isEmpty()) ? "." + type : "")); - } - - if (isRenameAllowed) { - destFs.rename(sourcePath, destFilePath); - } else if (isSrcLocal) { - destFs.copyFromLocalFile(sourcePath, destFilePath); - } else { - FileUtils.copy(sourceFs, sourcePath, destFs, destFilePath, - true, // delete source - false, // overwrite destination - conf); - } - return destFilePath; - } - // Clears the dest dir when src is sub-dir of dest. public static void clearDestForSubDirSrc(final HiveConf conf, Path dest, Path src, boolean isSrcLocal) throws IOException { @@ -3772,31 +3601,11 @@ public Void call() throws HiveException { } } - static private HiveException getHiveException(Exception e, String msg) { + static public HiveException getHiveException(Exception e, String msg) { return getHiveException(e, msg, null); } - static private HiveException handlePoolException(ExecutorService pool, Exception e) { - HiveException he = null; - - if (e instanceof HiveException) { - he = (HiveException) e; - if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) { - if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) { - LOG.error("Failed to move: {}", he.getMessage()); - } else { - LOG.error("Failed to move: {}", he.getRemoteErrorMsg()); - } - } - } else { - LOG.error("Failed to move: {}", e.getMessage()); - he = new HiveException(e.getCause()); - } - pool.shutdownNow(); - return he; - } - - static private HiveException getHiveException(Exception e, String msg, String logMsg) { + static public HiveException getHiveException(Exception e, String msg, String logMsg) { // The message from remote exception includes the entire stack. The error thrown from // hive based on the remote exception needs only the first line. 
String hiveErrMsg = null; @@ -3823,12 +3632,10 @@ static private HiveException getHiveException(Exception e, String msg, String lo } /** - * If moving across different FileSystems or differnent encryption zone, need to do a File copy instead of rename. - * TODO- consider if need to do this for different file authority. - * @throws HiveException + * If moving across different FileSystems or different encryption zone, need to do a File copy instead of rename. */ static private boolean needToCopy(Path srcf, Path destf, FileSystem srcFs, - FileSystem destFs, String configuredOwner, boolean isManaged) throws HiveException { + FileSystem destFs, String configuredOwner, boolean isManaged) throws HiveException { //Check if different FileSystems if (!FileUtils.equalsFileSystem(srcFs, destFs)) { return true; @@ -3841,7 +3648,7 @@ static private boolean needToCopy(Path srcf, Path destf, FileSystem srcFs, try { srcs = srcFs.getFileStatus(srcf); - String runningUser = UserGroupInformation.getLoginUser().getUserName(); + String runningUser = UserGroupInformation.getLoginUser().getShortUserName(); boolean isOwned = FileUtils.isOwnerOfFileHierarchy(srcFs, srcs, configuredOwner, false); if (configuredOwner.equals(runningUser)) { // Check if owner has write permission, else it will have to copy @@ -3881,62 +3688,24 @@ static private boolean needToCopy(Path srcf, Path destf, FileSystem srcFs, } } - /** - * Copy files. 
This handles building the mapping for buckets and such between the source and - * destination - * @param conf Configuration object - * @param srcf source directory, if bucketed should contain bucket files - * @param destf directory to move files into - * @param fs Filesystem - * @param isSrcLocal true if source is on local file system - * @param isAcidIUD true if this is an ACID based Insert/Update/Delete - * @param isOverwrite if true, then overwrite if destination file exist, else add a duplicate copy - * @param newFiles if this is non-null, a list of files that were created as a result of this - * move will be returned. - * @param isManaged if table is managed. - * @throws HiveException - */ - static protected void copyFiles(HiveConf conf, Path srcf, Path destf, FileSystem fs, - boolean isSrcLocal, boolean isAcidIUD, - boolean isOverwrite, List newFiles, boolean isBucketed, - boolean isFullAcidTable, boolean isManaged) throws HiveException { - try { - // create the destination if it does not exist - if (!fs.exists(destf)) { - FileUtils.mkdir(fs, destf, conf); - } - } catch (IOException e) { - throw new HiveException( - "copyFiles: error while checking/creating destination directory!!!", - e); - } - - FileStatus[] srcs; - FileSystem srcFs; - try { - srcFs = srcf.getFileSystem(conf); - srcs = srcFs.globStatus(srcf); - } catch (IOException e) { - LOG.error(StringUtils.stringifyException(e)); - throw new HiveException("addFiles: filesystem error in check phase. " + e.getMessage(), e); - } - if (srcs == null) { - LOG.info("No sources specified to move: " + srcf); - return; - // srcs = new FileStatus[0]; Why is this needed? - } + static private HiveException handlePoolException(ExecutorService pool, Exception e) { + HiveException he = null; - // If we're moving files around for an ACID write then the rules and paths are all different. - // You can blame this on Owen. 
- if (isAcidIUD) { - moveAcidFiles(srcFs, srcs, destf, newFiles); + if (e instanceof HiveException) { + he = (HiveException) e; + if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) { + if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) { + LOG.error("Failed to move: {}", he.getMessage()); + } else { + LOG.error("Failed to move: {}", he.getRemoteErrorMsg()); + } + } } else { - // For ACID non-bucketed case, the filenames have to be in the format consistent with INSERT/UPDATE/DELETE Ops, - // i.e, like 000000_0, 000001_0_copy_1, 000002_0.gz etc. - // The extension is only maintained for files which are compressed. - copyFiles(conf, fs, srcs, srcFs, destf, isSrcLocal, isOverwrite, - newFiles, isFullAcidTable && !isBucketed, isManaged); + LOG.error("Failed to move: {}", e.getMessage()); + he = new HiveException(e.getCause()); } + pool.shutdownNow(); + return he; } public static void moveAcidFiles(FileSystem fs, FileStatus[] stats, Path dst, @@ -4069,149 +3838,6 @@ private static void moveAcidFiles(String deltaFileType, PathFilter pathFilter, F } } - /** - * Replaces files in the partition with new data set specified by srcf. Works - * by renaming directory of srcf to the destination file. - * srcf, destf, and tmppath should resident in the same DFS, but the oldPath can be in a - * different DFS. - * - * @param tablePath path of the table. Used to identify permission inheritance. - * @param srcf - * Source directory to be renamed to tmppath. It should be a - * leaf directory where the final data files reside. However it - * could potentially contain subdirectories as well. - * @param destf - * The directory where the final data needs to go - * @param oldPath - * The directory where the old data location, need to be cleaned up. Most of time, will be the same - * as destf, unless its across FileSystem boundaries. 
- * @param purge - * When set to true files which needs to be deleted are not moved to Trash - * @param isSrcLocal - * If the source directory is LOCAL - * @param newFiles - * Output the list of new files replaced in the destination path - * @param isManaged - * If the table is managed. - */ - protected void replaceFiles(Path tablePath, Path srcf, Path destf, Path oldPath, HiveConf conf, - boolean isSrcLocal, boolean purge, List newFiles, PathFilter deletePathFilter, - boolean isNeedRecycle, boolean isManaged) throws HiveException { - try { - - FileSystem destFs = destf.getFileSystem(conf); - // check if srcf contains nested sub-directories - FileStatus[] srcs; - FileSystem srcFs; - try { - srcFs = srcf.getFileSystem(conf); - srcs = srcFs.globStatus(srcf); - } catch (IOException e) { - throw new HiveException("Getting globStatus " + srcf.toString(), e); - } - if (srcs == null) { - LOG.info("No sources specified to move: " + srcf); - return; - } - - if (oldPath != null) { - deleteOldPathForReplace(destf, oldPath, conf, purge, deletePathFilter, isNeedRecycle); - } - - // first call FileUtils.mkdir to make sure that destf directory exists, if not, it creates - // destf - boolean destfExist = FileUtils.mkdir(destFs, destf, conf); - if(!destfExist) { - throw new IOException("Directory " + destf.toString() - + " does not exist and could not be created."); - } - - // Two cases: - // 1. srcs has only a src directory, if rename src directory to destf, we also need to - // Copy/move each file under the source directory to avoid to delete the destination - // directory if it is the root of an HDFS encryption zone. - // 2. 
srcs must be a list of files -- ensured by LoadSemanticAnalyzer - // in both cases, we move the file under destf - if (srcs.length == 1 && srcs[0].isDirectory()) { - if (!moveFile(conf, srcs[0].getPath(), destf, true, isSrcLocal, isManaged)) { - throw new IOException("Error moving: " + srcf + " into: " + destf); - } - - // Add file paths of the files that will be moved to the destination if the caller needs it - if (null != newFiles) { - listNewFilesRecursively(destFs, destf, newFiles); - } - } else { - // its either a file or glob - for (FileStatus src : srcs) { - Path destFile = new Path(destf, src.getPath().getName()); - if (!moveFile(conf, src.getPath(), destFile, true, isSrcLocal, isManaged)) { - throw new IOException("Error moving: " + srcf + " into: " + destf); - } - - // Add file paths of the files that will be moved to the destination if the caller needs it - if (null != newFiles) { - newFiles.add(destFile); - } - } - } - } catch (IOException e) { - throw new HiveException(e.getMessage(), e); - } - } - - private void deleteOldPathForReplace(Path destPath, Path oldPath, HiveConf conf, boolean purge, - PathFilter pathFilter, boolean isNeedRecycle) throws HiveException { - Utilities.FILE_OP_LOGGER.debug("Deleting old paths for replace in " + destPath - + " and old path " + oldPath); - boolean isOldPathUnderDestf = false; - try { - FileSystem oldFs = oldPath.getFileSystem(conf); - FileSystem destFs = destPath.getFileSystem(conf); - // if oldPath is destf or its subdir, its should definitely be deleted, otherwise its - // existing content might result in incorrect (extra) data. - // But not sure why we changed not to delete the oldPath in HIVE-8750 if it is - // not the destf or its subdir? 
- isOldPathUnderDestf = isSubDir(oldPath, destPath, oldFs, destFs, false); - if (isOldPathUnderDestf) { - cleanUpOneDirectoryForReplace(oldPath, oldFs, pathFilter, conf, purge, isNeedRecycle); - } - } catch (IOException e) { - if (isOldPathUnderDestf) { - // if oldPath is a subdir of destf but it could not be cleaned - throw new HiveException("Directory " + oldPath.toString() - + " could not be cleaned up.", e); - } else { - //swallow the exception since it won't affect the final result - LOG.warn("Directory " + oldPath.toString() + " cannot be cleaned: " + e, e); - } - } - } - - - private void cleanUpOneDirectoryForReplace(Path path, FileSystem fs, - PathFilter pathFilter, HiveConf conf, boolean purge, boolean isNeedRecycle) throws IOException, HiveException { - if (isNeedRecycle && conf.getBoolVar(HiveConf.ConfVars.REPLCMENABLED)) { - recycleDirToCmPath(path, purge); - } - FileStatus[] statuses = fs.listStatus(path, pathFilter); - if (statuses == null || statuses.length == 0) { - return; - } - if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) { - String s = "Deleting files under " + path + " for replace: "; - for (FileStatus file : statuses) { - s += file.getPath().getName() + ", "; - } - Utilities.FILE_OP_LOGGER.trace(s); - } - - if (!trashFiles(fs, statuses, conf, purge)) { - throw new HiveException("Old path " + path + " has not been cleaned up."); - } - } - - /** * Trashes or deletes all files under a directory. Leaves the directory as is. * @param fs FileSystem to use @@ -4281,8 +3907,7 @@ public List exchangeTablePartitions(Map partitionSpec } /** - * Creates a metastore client. Currently it creates only JDBC based client as - * File based store support is removed + * Creates a metastore client using a factory specified via HiveConf. 
* * @returns a Meta Store Client * @throws HiveMetaException @@ -4300,12 +3925,8 @@ public HiveMetaHook getHook( } }; - if (conf.getBoolVar(ConfVars.METASTORE_FASTPATH)) { - return new SessionHiveMetaStoreClient(conf, hookLoader, allowEmbedded); - } else { - return RetryingMetaStoreClient.getProxy(conf, hookLoader, metaCallTimeMap, - SessionHiveMetaStoreClient.class.getName(), allowEmbedded); - } + HiveMetaStoreClientFactory factory = createMetaStoreClientFactory(); + return factory.createMetaStoreClient(conf, hookLoader, allowEmbedded, metaCallTimeMap); } @Nullable @@ -4324,6 +3945,24 @@ private HiveStorageHandler createStorageHandler(org.apache.hadoop.hive.metastore } } + private HiveMetaStoreClientFactory createMetaStoreClientFactory() throws MetaException { + String metaStoreClientFactoryClassName = + conf.getVar(HiveConf.ConfVars.METASTORE_CLIENT_FACTORY_CLASS); + + try { + Class factoryClass = + conf.getClassByName(metaStoreClientFactoryClassName) + .asSubclass(HiveMetaStoreClientFactory.class); + return ReflectionUtils.newInstance(factoryClass, conf); + } catch (Exception e) { + String errorMessage = String.format( + "Unable to instantiate a metastore client factory %s due to: %s", + metaStoreClientFactoryClassName, e); + LOG.error(errorMessage, e); + throw new MetaException(errorMessage); + } + } + public static class SchemaException extends MetaException { private static final long serialVersionUID = 1L; public SchemaException(String message) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreClientFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreClientFactory.java new file mode 100644 index 000000000000..ed17b3b14791 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveMetaStoreClientFactory.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.metadata; + +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaHookLoader; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.MetaException; + +/** + * Abstract factory that defines an interface for other factories that produce concrete + * MetaStoreClient objects. + * + */ +public interface HiveMetaStoreClientFactory { + + /** + * A method for producing IMetaStoreClient objects. + * + * The implementation returned by this method must throw a MetaException if allowEmbedded = true + * and it does not support embedded mode. + * + * @param conf + * Hive Configuration. + * @param hookLoader + * Hook for handling events related to tables. + * @param allowEmbedded + * Flag indicating the implementation must run in-process, e.g. for unit testing or + * "fast path". + * @param metaCallTimeMap + * A container for storing entry and exit timestamps of IMetaStoreClient method + * invocations. + * @return IMetaStoreClient An implementation of IMetaStoreClient. 
+ * @throws MetaException + */ + IMetaStoreClient createMetaStoreClient(HiveConf conf, HiveMetaHookLoader hookLoader, + boolean allowEmbedded, ConcurrentHashMap metaCallTimeMap) throws MetaException; +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java index f1c4d9827bd5..b7d2b2f705ce 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java @@ -20,7 +20,13 @@ import java.util.ArrayList; import java.util.List; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.hadoop.hive.metastore.HiveMetaHook; +import org.apache.hadoop.hive.metastore.HiveMetaHookLoader; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Table; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -438,4 +444,36 @@ public static String getReplPolicy(String dbName, String tableName) { return dbName.toLowerCase() + "." + tableName.toLowerCase(); } } + + public static IMetaStoreClient createMetaStoreClient(final HiveConf conf, boolean allowEmbedded, ConcurrentHashMap metaCallTimeMap) throws MetaException { + HiveMetaHookLoader hookLoader = new HiveMetaHookLoader() { + public HiveMetaHook getHook(Table tbl) throws MetaException { + try { + if (tbl == null) { + return null; + } else { + HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(conf, (String)tbl.getParameters().get("storage_handler")); + return storageHandler == null ? 
null : storageHandler.getMetaHook(); + } + } catch (HiveException var3) { + HiveUtils.LOG.error(StringUtils.stringifyException(var3)); + throw new MetaException("Failed to load storage handler: " + var3.getMessage()); + } + } + }; + return createMetaStoreClientFactory(conf).createMetaStoreClient(conf, hookLoader, allowEmbedded, metaCallTimeMap); + } + + private static HiveMetaStoreClientFactory createMetaStoreClientFactory(HiveConf conf) throws MetaException { + String metaStoreClientFactoryClassName = conf.getVar(HiveConf.ConfVars.METASTORE_CLIENT_FACTORY_CLASS); + + try { + Class factoryClass = conf.getClassByName(metaStoreClientFactoryClassName).asSubclass(HiveMetaStoreClientFactory.class); + return (HiveMetaStoreClientFactory)ReflectionUtils.newInstance(factoryClass, conf); + } catch (Exception e) { + String errorMessage = String.format("Unable to instantiate a metastore client factory %s due to: %s", metaStoreClientFactoryClassName, e); + LOG.error(errorMessage, e); + throw new MetaException(errorMessage); + } + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClientFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClientFactory.java new file mode 100644 index 000000000000..9d8445f3b6dc --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClientFactory.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.metadata; + +import static com.google.common.base.Preconditions.checkNotNull; + +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.HiveMetaHookLoader; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.RetryingMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.MetaException; + +/** + * Default MetaStoreClientFactory for Hive which produces SessionHiveMetaStoreClient objects. 
+ * + */ +public final class SessionHiveMetaStoreClientFactory implements HiveMetaStoreClientFactory { + + @Override + public IMetaStoreClient createMetaStoreClient(HiveConf conf, HiveMetaHookLoader hookLoader, + boolean allowEmbedded, + ConcurrentHashMap metaCallTimeMap) throws MetaException { + + checkNotNull(conf, "conf cannot be null!"); + checkNotNull(hookLoader, "hookLoader cannot be null!"); + checkNotNull(metaCallTimeMap, "metaCallTimeMap cannot be null!"); + + if (conf.getBoolVar(ConfVars.METASTORE_FASTPATH)) { + return new SessionHiveMetaStoreClient(conf, hookLoader, allowEmbedded); + } else { + return RetryingMetaStoreClient.getProxy(conf, hookLoader, metaCallTimeMap, + SessionHiveMetaStoreClient.class.getName(), allowEmbedded); + } + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PathOutputCommitterResolver.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PathOutputCommitterResolver.java new file mode 100644 index 000000000000..e7a3d6fb99d8 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PathOutputCommitterResolver.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer.physical; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.commit.CommitConstants; +import org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants; + +import org.apache.hadoop.hive.common.BlobStorageUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.MoveTask; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.PathOutputCommitterSetupTask; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; +import org.apache.hadoop.hive.ql.exec.spark.SparkTask; +import org.apache.hadoop.hive.ql.exec.tez.TezTask; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; +import org.apache.hadoop.hive.ql.plan.MoveWork; +import org.apache.hadoop.hive.ql.plan.PathOutputCommitterWork; + +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.JobID; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory; +import org.apache.hadoop.mapreduce.task.JobContextImpl; +import 
org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; +import java.util.UUID; +import java.util.stream.Collectors; + + +/** + * This {@link PhysicalPlanResolver} should only be triggered if + * {@link HiveConf.ConfVars#HIVE_BLOBSTORE_USE_OUTPUTCOMMITTER} is set to true. This class is + * used to integrate Hive with a specified + * {@link org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter}. When triggered + * the {@link #resolve(PhysicalContext)} method will modify the operator and task DAGs so that + * they run all necessary stages of the specified PathOutputCommitter. Since Output Committers + * specify how data should be committed, this class only modifies query plans that write data. + * Integration with Output Committers is done via modifying the {@link FileSinkOperator} and + * {@link MoveTask} as well as introducing a {@link PathOutputCommitterSetupTask} to the task DAG. + * + *
<p>
+ * This class currently has the following restrictions: (1) it is not triggered for dynamic + * partitioning queries, and (2) it is not triggered when the merge-small-files job is enabled. + *
</p>
+ */ +public class PathOutputCommitterResolver implements PhysicalPlanResolver { + + private static final Logger LOG = LoggerFactory.getLogger(PathOutputCommitterResolver.class); + + // Useful for testing + private static final String HIVE_BLOBSTORE_COMMIT_DISABLE_EXPLAIN = "hive.blobstore.commit." + + "disable.explain"; + + private final Map, Collection> taskToFsOps = new HashMap<>(); + private final List> mvTasks = new ArrayList<>(); + private HiveConf hconf; + + @Override + public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { + this.hconf = pctx.getConf(); + + LOG.info("PathOutputCommitterResolver: resolve() invoked. hive.blobstore.use.output-committer={}", + hconf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_USE_OUTPUTCOMMITTER)); + + // Collect all MoveTasks and FSOPs + TaskGraphWalker graphWalker = new TaskGraphWalker(new PathOutputCommitterDispatcher()); + List rootTasks = new ArrayList<>(pctx.getRootTasks()); + graphWalker.startWalking(rootTasks, null); + + // Find MoveTasks with no child MoveTask + List> sinkMoveTasks = mvTasks.stream() + .filter(mvTask -> !containsChildTask(mvTask.getChildTasks(), MoveTask.class)) + .collect(Collectors.toList()); + + LOG.info("PathOutputCommitterResolver: Found {} MoveTasks, {} sinkMoveTasks, {} tasks with FsOps", + mvTasks.size(), sinkMoveTasks.size(), taskToFsOps.size()); + + // Iterate through each FSOP + for (Map.Entry, Collection> entry : taskToFsOps.entrySet()) { + for (FileSinkOperator fsOp : entry.getValue()) { + try { + processFsOp(entry.getKey(), fsOp, sinkMoveTasks); + } catch (HiveException | MetaException e) { + throw new SemanticException(e); + } + } + } + + return pctx; + } + + private boolean containsChildTask(List> mvTasks, Class + taskClass) { + if (mvTasks == null) { + return false; + } + boolean containsChildTask = false; + for (Task mvTask : mvTasks) { + if (taskClass.isInstance(mvTask)) { + return true; + } + containsChildTask = containsChildTask(mvTask.getChildTasks(), 
taskClass);
+      if (containsChildTask) {
+        // Short-circuit as soon as a match is found. Without this, a later sibling
+        // whose subtree has no match would overwrite a positive result from an
+        // earlier sibling, making the method wrongly return false.
+        return true;
+      }
+    }
+    return containsChildTask;
+  }
+
+  private class PathOutputCommitterDispatcher implements Dispatcher {
+
+    @Override
+    public Object dispatch(Node nd, Stack<Node> stack,
+        Object... nodeOutputs) throws SemanticException {
+
+      Task<? extends Serializable> task = (Task<? extends Serializable>) nd;
+      Collection<FileSinkOperator> fsOps = getAllFsOps(task);
+      if (!fsOps.isEmpty()) {
+        taskToFsOps.put((Task<? extends Serializable>) nd, fsOps);
+      }
+      if (nd instanceof MoveTask) {
+        mvTasks.add((MoveTask) nd);
+      }
+      return null;
+    }
+  }
+
+  private Collection<FileSinkOperator> getAllFsOps(Task<? extends Serializable> task) {
+    Collection<Operator<?>> fsOps = new ArrayList<>();
+    if (task instanceof MapRedTask) {
+      fsOps.addAll(((MapRedTask) task).getWork().getAllOperators());
+    } else if (task instanceof SparkTask) {
+      for (BaseWork work : ((SparkTask) task).getWork().getAllWork()) {
+        fsOps.addAll(work.getAllOperators());
+      }
+    } else if (task instanceof TezTask) {
+      for (BaseWork work : ((TezTask) task).getWork().getAllWork()) {
+        fsOps.addAll(work.getAllOperators());
+      }
+    }
+    return fsOps.stream()
+        .filter(FileSinkOperator.class::isInstance)
+        .map(FileSinkOperator.class::cast)
+        .collect(Collectors.toList());
+  }
+
+  private void processFsOp(Task<? extends Serializable> task, FileSinkOperator fsOp,
+      List<Task<MoveWork>> sinkMoveTasks) throws HiveException, MetaException {
+    FileSinkDesc fileSinkDesc = fsOp.getConf();
+
+    // Get the MoveTask that will process the output of the fsOp
+    Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(sinkMoveTasks,
+        fileSinkDesc.getFinalDirName(), fileSinkDesc.isMmTable());
+
+    LOG.info("PathOutputCommitterResolver: Processing FsOp with finalDirName={}, isMmTable={}, mvTask={}",
+        fileSinkDesc.getFinalDirName(), fileSinkDesc.isMmTable(), mvTask);
+
+    if (mvTask != null) {
+
+      MoveWork mvWork = mvTask.getWork();
+
+      // Don't process the mvTask if it requires committing data for DP queries
+      boolean hasLoadMultiFiles = mvWork.getLoadMultiFilesWork() != null;
+      boolean hasLoadTableWithDP = mvWork.getLoadTableWork() != null && mvWork.getLoadTableWork().getDPCtx() != null;
+
LOG.info("PathOutputCommitterResolver: hasLoadMultiFiles={}, hasLoadTableWithDP={}, loadFileWork={}, loadTableWork={}", + hasLoadMultiFiles, hasLoadTableWithDP, mvWork.getLoadFileWork(), mvWork.getLoadTableWork()); + + // Throw exception if dynamic partitioning is used with blobstore output committer + // Magic committer does not support dynamic partitioning due to JobID mismatch issues with Tez + // See HADOOP-19091 and HIVE-19321 for details + if (hasLoadTableWithDP || hasLoadMultiFiles) { + throw new SemanticException("Magic committer (hive.blobstore.use.output-committer=true) " + + "is not supported with dynamic partitioning. Either set hive.blobstore.use.output-committer=false " + + "or use static partitioning. See HADOOP-19091 and HIVE-19321 for details."); + } + + if (!hasLoadMultiFiles && !hasLoadTableWithDP) { + + // The final output path we will commit data to + Path outputPath = null; + + // Instead of picking between load table work and load file work, throw an exception if + // they are both set (this should never happen) + if (mvWork.getLoadTableWork() != null && mvWork.getLoadFileWork() != null) { + throw new IllegalArgumentException("Load Table Work and Load File Work cannot both be " + + "set"); + } + + // If there is a load file work, get its output path + if (mvWork.getLoadFileWork() != null) { + outputPath = getLoadFileOutputPath(mvWork); + } + + // If there is a load table work, get is output path + if (mvTask.getWork().getLoadTableWork() != null) { + outputPath = getLoadTableOutputPath(mvWork); + } + if (outputPath != null) { + boolean isBlobPath = BlobStorageUtils.isBlobStoragePath(hconf, outputPath); + String committerFactory = hconf.get(String.format(PathOutputCommitterFactory.COMMITTER_FACTORY_SCHEME_PATTERN, + outputPath.toUri().getScheme())); + LOG.info("PathOutputCommitterResolver: outputPath={}, isBlobPath={}, committerFactory={}", + outputPath, isBlobPath, committerFactory); + + if (isBlobPath && committerFactory != null) { + + // All 
s3a specific logic should be place in the method below, for all filesystems or + // output committer implementations, a similar pattern should be followed + if ("s3a".equals(outputPath.toUri().getScheme())) { + setupS3aOutputCommitter(mvWork); + } + + PathOutputCommitterWork setupWork = createPathOutputCommitterWork(outputPath); + mvWork.setPathOutputCommitterWork(setupWork); + + fileSinkDesc.setHasOutputCommitter(true); + fileSinkDesc.setTargetDirName(outputPath.toString()); + + LOG.info("Using Output Committer " + setupWork.getPathOutputCommitterClass() + + " for MoveTask: " + mvTask + ", FileSinkOperator: " + fsOp + " and output " + + "path: " + outputPath); + + if (hconf.getBoolean(HIVE_BLOBSTORE_COMMIT_DISABLE_EXPLAIN, false)) { + PathOutputCommitterSetupTask setupTask = new PathOutputCommitterSetupTask(); + setupTask.setWork(setupWork); + setupTask.executeTask(null); + } else { + task.addDependentTask(TaskFactory.get(setupWork)); + } + } + } + } + } + } + + // Config key for sharing JobID between planning (PathOutputCommitterResolver) and execution (FileSinkOperator) + public static final String HIVE_MAGIC_COMMITTER_JOB_ID = "hive.magic.committer.job.id"; + + private PathOutputCommitterWork createPathOutputCommitterWork(Path outputPath) { + // Use a deterministic JobID so that FileSinkOperator can create matching TaskAttemptIDs. + // This ensures the magic committer's __magic_job-{id} paths are consistent between + // task writes and job commit. See HADOOP-19091. 
+ JobID jobID = new JobID("", 0); + + // Store the JobID in config so FileSinkOperator can use the same one + hconf.set(HIVE_MAGIC_COMMITTER_JOB_ID, jobID.toString()); + LOG.info("PathOutputCommitterResolver: Using JobID {} for magic committer", jobID); + + TaskAttemptContext taskAttemptContext = createTaskAttemptContext(jobID); + JobContext jobContext = new JobContextImpl(hconf, jobID); + + return new PathOutputCommitterWork(outputPath.toString(), + jobContext, taskAttemptContext); + } + + /** + * Setups any necessary configuration specific to a s3a. All s3a specific logic should be + * encapsulated within this method. + */ + private void setupS3aOutputCommitter(MoveWork mvWork) { + if (mvWork.getLoadTableWork() != null && mvWork.getLoadTableWork() + .isInsertOverwrite()) { + hconf.set(CommitConstants.FS_S3A_COMMITTER_STAGING_CONFLICT_MODE, CommitConstants. + CONFLICT_MODE_REPLACE); + } else { + hconf.set(CommitConstants.FS_S3A_COMMITTER_STAGING_CONFLICT_MODE, CommitConstants + .CONFLICT_MODE_APPEND); + } + + // We set this to false because its better for Hive to have more control over the file + // names given the that the JobID we are creating has no meaning + hconf.setBoolean(CommitConstants.FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES, false); + + // Since the Hive-PathOutputCommitter integration is done manually (e.g. 
not through a + // framework such as MR or Spark) there is no auto-assigned staging ID, so we add one ourselves + hconf.set("fs.s3a.committer.staging.uuid", UUID.randomUUID().toString()); + } + + // Somewhat copied from TaskCompiler, which uses similar logic to get the default location for + // the target table in a CTAS query + private Path getDefaultPartitionPath(Path tablePath, Map partitionSpec) + throws MetaException { + Warehouse wh = new Warehouse(hconf); + return wh.getPartitionPath(tablePath, partitionSpec); + } + + private TaskAttemptContext createTaskAttemptContext(JobID jobID) { + return new TaskAttemptContextImpl(hconf, + new TaskAttemptID(jobID.getJtIdentifier(), jobID.getId(), TaskType.JOB_SETUP, 0, 0)); + } + + private Path getLoadFileOutputPath(MoveWork mvWork) { + return mvWork.getLoadFileWork().getTargetDir(); + } + + private Path getLoadTableOutputPath(MoveWork mvWork) throws HiveException, MetaException { + if (mvWork.getLoadTableWork().getPartitionSpec() != null && + !mvWork.getLoadTableWork().getPartitionSpec().isEmpty()) { + return getLoadPartitionOutputPath(mvWork); + } else { + // should probably replace this with Hive.getTable().getDataLocation() + return new Path(mvWork.getLoadTableWork().getTable().getProperties() + .getProperty("location")); // INSERT INTO ... VALUES (...) 
+ } + } + + private Path getLoadPartitionOutputPath(MoveWork mvWork) throws HiveException, MetaException { + Hive db = Hive.get(); + Partition partition = db.getPartition(db.getTable(mvWork.getLoadTableWork() + .getTable().getTableName()), + mvWork.getLoadTableWork().getPartitionSpec(), false); + if (partition != null) { + return partition.getDataLocation(); + } else { + return getDefaultPartitionPath(db.getTable(mvWork.getLoadTableWork() + .getTable().getTableName()).getDataLocation(), mvWork + .getLoadTableWork().getPartitionSpec()); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java index d508d02ed1ef..c3a578663aea 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/PhysicalOptimizer.java @@ -22,6 +22,8 @@ import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; @@ -30,6 +32,7 @@ * PhysicalPlanResolver. Each resolver has its own set of optimization rule. 
*/ public class PhysicalOptimizer { + private static final Logger LOG = LoggerFactory.getLogger(PhysicalOptimizer.class); private PhysicalContext pctx; private List resolvers; @@ -98,6 +101,13 @@ private void initialize(HiveConf hiveConf) { if (pctx.getContext().getExplainAnalyze() != null) { resolvers.add(new AnnotateRunTimeStatsOptimizer()); } + + boolean useOutputCommitter = hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_USE_OUTPUTCOMMITTER); + LOG.info("PhysicalOptimizer: hive.blobstore.use.output-committer = {}", useOutputCommitter); + if (useOutputCommitter) { + LOG.info("PhysicalOptimizer: Adding PathOutputCommitterResolver"); + resolvers.add(new PathOutputCommitterResolver()); + } } /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index dfd790853b2f..bc9551bfbb53 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -87,6 +87,7 @@ import org.apache.hadoop.hive.ql.optimizer.physical.MemoryDecider; import org.apache.hadoop.hive.ql.optimizer.physical.MetadataOnlyOptimizer; import org.apache.hadoop.hive.ql.optimizer.physical.NullScanOptimizer; +import org.apache.hadoop.hive.ql.optimizer.physical.PathOutputCommitterResolver; import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext; import org.apache.hadoop.hive.ql.optimizer.physical.SerializeFilter; import org.apache.hadoop.hive.ql.optimizer.physical.StageIDsRearranger; @@ -710,6 +711,12 @@ protected void optimizeTaskPlan(List> rootTasks, Pa new AnnotateRunTimeStatsOptimizer().resolve(physicalCtx); } + if (conf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_USE_OUTPUTCOMMITTER)) { + physicalCtx = new PathOutputCommitterResolver().resolve(physicalCtx); + } else { + LOG.debug("Skipping PathOutputCommitterResolver"); + } + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "optimizeTaskPlan"); 
return; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java index 0a76ffa28bd9..b13bb3ae80b9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/SparkCompiler.java @@ -70,6 +70,7 @@ import org.apache.hadoop.hive.ql.optimizer.physical.MetadataOnlyOptimizer; import org.apache.hadoop.hive.ql.optimizer.physical.NullScanOptimizer; import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext; +import org.apache.hadoop.hive.ql.optimizer.physical.PathOutputCommitterResolver; import org.apache.hadoop.hive.ql.optimizer.physical.SparkCrossProductCheck; import org.apache.hadoop.hive.ql.optimizer.physical.SparkDynamicPartitionPruningResolver; import org.apache.hadoop.hive.ql.optimizer.physical.SparkMapJoinResolver; @@ -613,6 +614,12 @@ protected void optimizeTaskPlan(List> rootTasks, Pa new AnnotateRunTimeStatsOptimizer().resolve(physicalCtx); } + if (conf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_USE_OUTPUTCOMMITTER)) { + new PathOutputCommitterResolver().resolve(physicalCtx); + } else { + LOG.debug("Skipping S3A commit optimizer"); + } + PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_OPTIMIZE_TASK_TREE); return; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java index 42b8f40fc829..ee8bbbb85aee 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Objects; @@ -28,6 +29,9 @@ import org.apache.hadoop.hive.ql.optimizer.signature.Signature; import org.apache.hadoop.hive.ql.plan.Explain.Level; import org.apache.hadoop.hive.ql.plan.Explain.Vectorization; 
+import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter; /** * FileSinkDesc. @@ -63,6 +67,8 @@ public enum DPSortState { private DynamicPartitionCtx dpCtx; private String staticSpec; // static partition spec ends with a '/' private boolean gatherStats; + private String targetDirName; + private boolean hasOutputCommitter; // Consider a query like: // insert overwrite table T3 select ... from T1 join T2 on T1.key = T2.key; @@ -592,6 +598,14 @@ public boolean getInsertOverwrite() { return isInsertOverwrite; } + public String getTargetDirName() { + return this.targetDirName; + } + + public void setTargetDirName(String targetDirName) { + this.targetDirName = targetDirName; + } + @Override public boolean isSame(OperatorDesc other) { if (getClass().getName().equals(other.getClass().getName())) { @@ -610,4 +624,11 @@ public boolean isSame(OperatorDesc other) { return false; } + public void setHasOutputCommitter(boolean hasOutputCommitter) { + this.hasOutputCommitter = hasOutputCommitter; + } + + public boolean getHasOutputCommitter() { + return this.hasOutputCommitter; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MoveWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MoveWork.java index 9a1e3a1af5d9..9270b02485d5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MoveWork.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MoveWork.java @@ -38,8 +38,10 @@ public class MoveWork implements Serializable { private LoadTableDesc loadTableWork; private LoadFileDesc loadFileWork; private LoadMultiFilesDesc loadMultiFilesWork; + private PathOutputCommitterWork pathOutputCommitterWork; private boolean checkFileFormat; private boolean srcLocal; + private boolean needCleanTarget; /** * ReadEntitites that are passed to the hooks. 
@@ -153,5 +155,21 @@ public boolean isSrcLocal() { public void setSrcLocal(boolean srcLocal) { this.srcLocal = srcLocal; } - + + public boolean isNeedCleanTarget() { + return needCleanTarget; + } + + public void setNeedCleanTarget(boolean needCleanTarget) { + this.needCleanTarget = needCleanTarget; + } + + public PathOutputCommitterWork getPathOutputCommitterWork() { + return this.pathOutputCommitterWork; + } + + public void setPathOutputCommitterWork( + PathOutputCommitterWork pathOutputCommitterWork) { + this.pathOutputCommitterWork = pathOutputCommitterWork; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PathOutputCommitterWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/PathOutputCommitterWork.java new file mode 100644 index 000000000000..4caed5584894 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PathOutputCommitterWork.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter; +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory; + +import java.io.IOException; +import java.io.Serializable; + + +@Explain(displayName = "Path Output Committer Setup Work", explainLevels = {Explain.Level.USER, Explain.Level.DEFAULT, Explain.Level.EXTENDED}) +public class PathOutputCommitterWork implements Serializable { + + private static final long serialVersionUID = -6333040835478371176L; + + private String outputPath; + private transient JobContext jobContext; + private transient TaskAttemptContext taskAttemptContext; + + public PathOutputCommitterWork(String outputPath, JobContext jobContext, + TaskAttemptContext taskAttemptContext) { + this.outputPath = outputPath; + this.jobContext = jobContext; + this.taskAttemptContext = taskAttemptContext; + } + + @Explain(displayName = "Path Output Committer Factory") + public Class getPathOutputCommitterClass() { + return PathOutputCommitterFactory.getCommitterFactory(new Path(this.outputPath), this.jobContext + .getConfiguration()).getClass(); + } + + @Explain(displayName = "Path Output Path") + public String getOutputPath() { + return this.outputPath; + } + + public JobContext getJobContext() { + return this.jobContext; + } + + public PathOutputCommitter createPathOutputCommitter() throws IOException { + return PathOutputCommitterFactory.createCommitter(new Path(this.outputPath), + this.taskAttemptContext); + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java index e1086846600b..6fbf19085c29 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestExecDriver.java @@ -142,7 
+142,7 @@ public class TestExecDriver extends TestCase { db.createTable(src, cols, null, TextInputFormat.class, HiveIgnoreKeyTextOutputFormat.class); db.loadTable(hadoopDataFile[i], src, LoadFileType.KEEP_EXISTING, - true, false, false, false, null, 0, false); + true, false, false, false, null, 0, false, new HiveDataCommitter()); i++; } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java index 81418de1f20e..d939973a9b3f 100755 --- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHive.java @@ -19,6 +19,8 @@ package org.apache.hadoop.hive.ql.metadata; import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_DATABASE_NAME; +import static org.junit.Assert.assertThat; +import static org.hamcrest.CoreMatchers.instanceOf; import java.util.ArrayList; import java.util.Arrays; @@ -33,6 +35,8 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.PartitionDropOptions; import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.Database; @@ -766,7 +770,41 @@ public void testHiveRefreshOnConfChange() throws Throwable{ newHiveObj = Hive.get(newHconf); assertTrue(prevHiveObj != newHiveObj); } - + + public void testLoadingHiveMetaStoreClientFactory() throws Throwable { + String factoryClassName = SessionHiveMetaStoreClientFactory.class.getName(); + HiveConf conf = new HiveConf(); + conf.setVar(ConfVars.METASTORE_CLIENT_FACTORY_CLASS, factoryClassName); + // Make sure we instantiate the embedded version + // so the implementation chosen is SessionHiveMetaStoreClient, not a retryable version of it. 
+ conf.setBoolVar(ConfVars.METASTORE_FASTPATH, true); + // The current object was constructed in setUp() before we got here + // so clean that up so we can inject our own dummy implementation of IMetaStoreClient + Hive.closeCurrent(); + Hive hive = Hive.get(conf); + IMetaStoreClient msc = hive.getMSC(); + assertNotNull("getMSC() failed.", msc); + assertThat("Invalid default client implementation created.", msc, + instanceOf(SessionHiveMetaStoreClient.class)); + } + + public void testLoadingInvalidHiveMetaStoreClientFactory() throws Throwable { + // Intentionally invalid class + String factoryClassName = String.class.getName(); + HiveConf conf = new HiveConf(); + conf.setVar(HiveConf.ConfVars.METASTORE_CLIENT_FACTORY_CLASS, factoryClassName); + // The current object was constructed in setUp() before we got here + // so clean that up so we can inject our own dummy implementation of IMetaStoreClient + Hive.closeCurrent(); + Hive hive = Hive.get(conf); + try { + hive.getMSC(); + fail("getMSC() was expected to throw MetaException."); + } catch (Exception e) { + return; + } + } + // shamelessly copied from Path in hadoop-2 private static final String SEPARATOR = "/"; private static final char SEPARATOR_CHAR = '/'; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java index a0c23b632da5..9bd4a1431c1f 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/metadata/TestHiveCopyFiles.java @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.HiveDataCommitter; import org.apache.hadoop.hive.ql.session.SessionState; import org.junit.BeforeClass; import org.junit.Rule; @@ -83,7 +84,7 @@ public void testRenameNewFilesOnSameFileSystem() throws IOException { FileSystem targetFs = 
targetPath.getFileSystem(hiveConf); try { - Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false,null, false, false, false); + new HiveDataCommitter().copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false,null, false, false, false); } catch (HiveException e) { e.printStackTrace(); assertTrue("Hive.copyFiles() threw an unexpected exception.", false); @@ -107,7 +108,7 @@ public void testRenameExistingFilesOnSameFileSystem() throws IOException { FileSystem targetFs = targetPath.getFileSystem(hiveConf); try { - Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null, false, false, false); + new HiveDataCommitter().copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null, false, false, false); } catch (HiveException e) { e.printStackTrace(); assertTrue("Hive.copyFiles() threw an unexpected exception.", false); @@ -127,7 +128,7 @@ public void testRenameExistingFilesOnSameFileSystem() throws IOException { sourceFolder.newFile("000001_0.gz"); try { - Hive.copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null, false, false, false); + new HiveDataCommitter().copyFiles(hiveConf, sourcePath, targetPath, targetFs, isSourceLocal, NO_ACID, false, null, false, false, false); } catch (HiveException e) { e.printStackTrace(); assertTrue("Hive.copyFiles() threw an unexpected exception.", false); @@ -158,7 +159,7 @@ public void testCopyNewFilesOnDifferentFileSystem() throws IOException { Mockito.when(spyTargetFs.getUri()).thenReturn(URI.create("hdfs://" + targetPath.toUri().getPath())); try { - Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false, false, false); + new HiveDataCommitter().copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false, false, false); } catch (HiveException e) { e.printStackTrace(); 
assertTrue("Hive.copyFiles() threw an unexpected exception.", false); @@ -185,7 +186,7 @@ public void testCopyExistingFilesOnDifferentFileSystem() throws IOException { Mockito.when(spyTargetFs.getUri()).thenReturn(URI.create("hdfs://" + targetPath.toUri().getPath())); try { - Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false, false, false); + new HiveDataCommitter().copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false, false, false); } catch (HiveException e) { e.printStackTrace(); assertTrue("Hive.copyFiles() threw an unexpected exception.", false); @@ -205,7 +206,7 @@ public void testCopyExistingFilesOnDifferentFileSystem() throws IOException { sourceFolder.newFile("000001_0.gz"); try { - Hive.copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false, false, false); + new HiveDataCommitter().copyFiles(hiveConf, sourcePath, targetPath, spyTargetFs, isSourceLocal, NO_ACID, false, null, false, false, false); } catch (HiveException e) { e.printStackTrace(); assertTrue("Hive.copyFiles() threw an unexpected exception.", false);