diff --git a/config.toml b/config.toml index 15f9fb97..e9aeb780 100644 --- a/config.toml +++ b/config.toml @@ -30,15 +30,15 @@ theme = 'hive' poweredby = "/general/poweredby/" javaDocs = "/docs/javadocs/" latest = "/docs/latest/" - languageManual = "https://hive.apache.org/docs/latest/language/languagemanual" + languageManual = "/docs/latest/language/languagemanual" license2 = "https://www.apache.org/licenses/LICENSE-2.0.html" privacyPolicy = "/general/privacypolicy/" designDocs = "/development/desingdocs/" hiveJira = "https://issues.apache.org/jira/projects/HIVE/issues" - faq = "https://hive.apache.org/community/resources/hivedeveloperfaq" + faq = "/community/resources/hivedeveloperfaq" vcs = "/development/versioncontrol/" committer = "/community/becomingcommitter/" - contribute = "https://hive.apache.org/community/resources/howtocontribute" + contribute = "/community/resources/howtocontribute" resourcesForDev = "/community/resources/" meetings = "/community/meetings/" mailinglists = "/community/mailinglists/" @@ -52,13 +52,13 @@ theme = 'hive' announcements = "/general/downloads/#23-november-2025--release-420-available" [params.features] - acidTxn = "https://hive.apache.org/docs/latest/user/hive-transactions" - hs2 = "https://hive.apache.org/docs/latest/user/hiveserver2-overview" - hms = "https://hive.apache.org/development/desingdocs/design" - compactions = "https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact" - repl = "https://hive.apache.org/docs/latest/admin/replication" - cbo = "https://hive.apache.org/docs/latest/user/cost-based-optimization-in-hive" - llap = "https://hive.apache.org/development/desingdocs/llap" + acidTxn = "/docs/latest/user/hive-transactions" + hs2 = "/docs/latest/user/hiveserver2-overview" + hms = "/development/desingdocs/design" + compactions = "/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact" + repl = "/docs/latest/admin/replication" + cbo = 
"/docs/latest/user/cost-based-optimization-in-hive" + llap = "/development/desingdocs/llap" iceberg = "https://iceberg.apache.org/docs/latest/hive/" [outputs] diff --git a/content/Development/desingdocs/design.md b/content/Development/desingdocs/design.md index 41033e53..fdb3d841 100644 --- a/content/Development/desingdocs/design.md +++ b/content/Development/desingdocs/design.md @@ -48,7 +48,7 @@ The Metastore provides two important but often overlooked features of a data war Metastore is an object store with a database or file backed store. The database backed store is implemented using an object-relational mapping (ORM) solution called the [DataNucleus](http://www.datanucleus.org/). The prime motivation for storing this in a relational database is queriability of metadata. Some disadvantages of using a separate data store for metadata instead of using HDFS are synchronization and scalability issues. Additionally there is no clear way to implement an object store on top of HDFS due to lack of random updates to files. This, coupled with the advantages of queriability of a relational store, made our approach a sensible one. -The metastore can be configured to be used in a couple of ways: remote and embedded. In remote mode, the metastore is a [Thrift](https://thrift.apache.org/) service. This mode is useful for non-Java clients. In embedded mode, the Hive client directly connects to an underlying metastore using JDBC. This mode is useful because it avoids another system that needs to be maintained and monitored. Both of these modes can co-exist. (Update: Local metastore is a third possibility. See [Hive Metastore Administration](https://hive.apache.org/docs/latest/admin/adminmanual-metastore-administration) for details.) +The metastore can be configured to be used in a couple of ways: remote and embedded. In remote mode, the metastore is a [Thrift](https://thrift.apache.org/) service. This mode is useful for non-Java clients. 
In embedded mode, the Hive client directly connects to an underlying metastore using JDBC. This mode is useful because it avoids another system that needs to be maintained and monitored. Both of these modes can co-exist. (Update: Local metastore is a third possibility. See [Hive Metastore Administration](/docs/latest/admin/adminmanual-metastore-administration) for details.) ### Metastore Interface @@ -56,7 +56,7 @@ Metastore provides a [Thrift interface](https://thrift.apache.org/docs/idl) to m ## Hive Query Language -HiveQL is an SQL-like query language for Hive. It mostly mimics SQL syntax for creation of tables, loading data into tables and querying the tables. HiveQL also allows users to embed their custom map-reduce scripts. These scripts can be written in any language using a simple row-based streaming interface – read rows from standard input and write out rows to standard output. This flexibility comes at a cost of a performance hit caused by converting rows from and to strings. However, we have seen that users do not mind this given that they can implement their scripts in the language of their choice. Another feature unique to HiveQL is multi-table insert. In this construct, users can perform multiple queries on the same input data using a single HiveQL query. Hive optimizes these queries to share the scan of the input data, thus increasing the throughput of these queries several orders of magnitude. We omit more details due to lack of space. For a more complete description of the HiveQL language see the [language manual](https://hive.apache.org/docs/latest/language/languagemanual). +HiveQL is an SQL-like query language for Hive. It mostly mimics SQL syntax for creation of tables, loading data into tables and querying the tables. HiveQL also allows users to embed their custom map-reduce scripts. These scripts can be written in any language using a simple row-based streaming interface – read rows from standard input and write out rows to standard output. 
This flexibility comes at a cost of a performance hit caused by converting rows from and to strings. However, we have seen that users do not mind this given that they can implement their scripts in the language of their choice. Another feature unique to HiveQL is multi-table insert. In this construct, users can perform multiple queries on the same input data using a single HiveQL query. Hive optimizes these queries to share the scan of the input data, thus increasing the throughput of these queries several orders of magnitude. We omit more details due to lack of space. For a more complete description of the HiveQL language see the [language manual](/docs/latest/language/languagemanual). ## Compiler @@ -67,11 +67,11 @@ HiveQL is an SQL-like query language for Hive. It mostly mimics SQL syntax for c ## Optimizer -More plan transformations are performed by the optimizer. The optimizer is an evolving component. As of 2011, it was rule-based and performed the following: column pruning and predicate pushdown. However, the infrastructure was in place, and there was work under progress to include other optimizations like map-side join. (Hive 0.11 added several [join optimizations](https://hive.apache.org/docs/latest/language/languagemanual-joinoptimization).) +More plan transformations are performed by the optimizer. The optimizer is an evolving component. As of 2011, it was rule-based and performed the following: column pruning and predicate pushdown. However, the infrastructure was in place, and there was work under progress to include other optimizations like map-side join. (Hive 0.11 added several [join optimizations](/docs/latest/language/languagemanual-joinoptimization).) - The optimizer can be enhanced to be cost-based (see [Cost-based optimization in Hive](https://hive.apache.org/docs/latest/user/cost-based-optimization-in-hive) and [HIVE-5775](https://issues.apache.org/jira/browse/HIVE-5775)). 
The sorted nature of output tables can also be preserved and used later on to generate better plans. The query can be performed on a small sample of data to guess the data distribution, which can be used to generate a better plan. + The optimizer can be enhanced to be cost-based (see [Cost-based optimization in Hive](/docs/latest/user/cost-based-optimization-in-hive) and [HIVE-5775](https://issues.apache.org/jira/browse/HIVE-5775)). The sorted nature of output tables can also be preserved and used later on to generate better plans. The query can be performed on a small sample of data to guess the data distribution, which can be used to generate a better plan. - A [correlation optimizer](https://hive.apache.org/development/desingdocs/correlation-optimizer) was added in Hive 0.12. + A [correlation optimizer](/development/desingdocs/correlation-optimizer) was added in Hive 0.12. The plan is a generic operator tree, and can be easily manipulated. diff --git a/content/Development/desingdocs/designdocs.md b/content/Development/desingdocs/designdocs.md index 5524ff7c..332f491c 100644 --- a/content/Development/desingdocs/designdocs.md +++ b/content/Development/desingdocs/designdocs.md @@ -23,22 +23,22 @@ Proposals that appear in the "Completed" and "In Progress" sections should inclu * [Table-level Statistics]({{< ref "statsdev" >}}) ([HIVE-1361](https://issues.apache.org/jira/browse/HIVE-1361)) * [Dynamic Partitions]({{< ref "dynamicpartitions" >}}) * [Binary Data Type]({{< ref "binary-datatype-proposal" >}}) ([HIVE-2380](https://issues.apache.org/jira/browse/HIVE-2380)) -* [Decimal Precision and Scale Support](https://hive.apache.org/attachments/27362075/34177489.pdf) +* [Decimal Precision and Scale Support](/attachments/27362075/34177489.pdf) * [HCatalog]({{< ref "hcatalog-base" >}}) (formerly [Howl]({{< ref "howl" >}})) * [HiveServer2]({{< ref "hiveserver2-thrift-api" >}}) ([HIVE-2935](https://issues.apache.org/jira/browse/HIVE-2935)) * [Column Statistics in Hive]({{< 
ref "column-statistics-in-hive" >}}) ([HIVE-1362](https://issues.apache.org/jira/browse/HIVE-1362)) * [List Bucketing]({{< ref "listbucketing" >}}) ([HIVE-3026](https://issues.apache.org/jira/browse/HIVE-3026)) * [Group By With Rollup]({{< ref "groupbywithrollup" >}}) ([HIVE-2397](https://issues.apache.org/jira/browse/HIVE-2397)) -* [Enhanced Aggregation, Cube, Grouping and Rollup](https://hive.apache.org/docs/latest/language/enhanced-aggregation-cube-grouping-and-rollup) ([HIVE-3433](https://issues.apache.org/jira/browse/HIVE-3433)) +* [Enhanced Aggregation, Cube, Grouping and Rollup](/docs/latest/language/enhanced-aggregation-cube-grouping-and-rollup) ([HIVE-3433](https://issues.apache.org/jira/browse/HIVE-3433)) * [Optimizing Skewed Joins]({{< ref "skewed-join-optimization" >}}) ([HIVE-3086](https://issues.apache.org/jira/browse/HIVE-3086)) * [Correlation Optimizer]({{< ref "correlation-optimizer" >}}) ([HIVE-2206](https://issues.apache.org/jira/browse/HIVE-2206)) * [Hive on Tez]({{< ref "hive-on-tez" >}}) ([HIVE-4660](https://issues.apache.org/jira/browse/HIVE-4660)) + [Hive-Tez Compatibility]({{< ref "hive-tez-compatibility" >}}) * [Vectorized Query Execution]({{< ref "vectorized-query-execution" >}}) ([HIVE-4160](https://issues.apache.org/jira/browse/HIVE-4160)) -* [Cost Based Optimizer in Hive](https://hive.apache.org/docs/latest/user/cost-based-optimization-in-hive) ([HIVE-5775](https://issues.apache.org/jira/browse/HIVE-5775)) +* [Cost Based Optimizer in Hive](/docs/latest/user/cost-based-optimization-in-hive) ([HIVE-5775](https://issues.apache.org/jira/browse/HIVE-5775)) * [Atomic Insert/Update/Delete](https://issues.apache.org/jira/browse/HIVE-5317) ([HIVE-5317](https://issues.apache.org/jira/browse/HIVE-5317)) * [Transaction Manager](https://issues.apache.org/jira/browse/HIVE-5843) ([HIVE-5843](https://issues.apache.org/jira/browse/HIVE-5843)) -* [SQL Standard based secure authorization](https://hive.apache.org/attachments/27362075/35193122.pdf) 
([HIVE-5837](https://issues.apache.org/jira/browse/HIVE-5837)) +* [SQL Standard based secure authorization](/attachments/27362075/35193122.pdf) ([HIVE-5837](https://issues.apache.org/jira/browse/HIVE-5837)) * [Hybrid Hybrid Grace Hash Join]({{< ref "hybrid-grace-hash-join-v1-0" >}}) ([HIVE-9277](https://issues.apache.org/jira/browse/HIVE-9277)) * [LLAP Daemons]({{< ref "llap" >}}) ([HIVE-7926](https://issues.apache.org/jira/browse/HIVE-7926)) * [Support for Hive Replication]({{< ref "hivereplicationdevelopment" >}}) ([HIVE-7973](https://issues.apache.org/jira/browse/HIVE-7973)) @@ -46,7 +46,7 @@ Proposals that appear in the "Completed" and "In Progress" sections should inclu ## In Progress * [Column Level Top K Statistics]({{< ref "top-k-stats" >}}) ([HIVE-3421](https://issues.apache.org/jira/browse/HIVE-3421)) -* [Hive on Spark](https://hive.apache.org/docs/latest/user/hive-on-spark) ([HIVE-7292](https://issues.apache.org/jira/browse/HIVE-7292)) +* [Hive on Spark](/docs/latest/user/hive-on-spark) ([HIVE-7292](https://issues.apache.org/jira/browse/HIVE-7292)) * [Hive on Spark: Join Design (HIVE-7613)]({{< ref "hive-on-spark-join-design-master" >}}) * [Improve ACID Performance](https://issues.apache.org/jira/secure/attachment/12823582/Design.Document.Improving%20ACID%20performance%20in%20Hive.02.docx) – download docx file ([HIVE-14035](https://issues.apache.org/jira/browse/HIVE-14035), [HIVE-14199](https://issues.apache.org/jira/browse/HIVE-14199), [HIVE-14233](https://issues.apache.org/jira/browse/HIVE-14233)) * [Query Results Caching]({{< ref "query-results-caching" >}}) ([HIVE-18513](https://issues.apache.org/jira/browse/HIVE-18513)) @@ -69,7 +69,7 @@ Proposals that appear in the "Completed" and "In Progress" sections should inclu * [Updatable Views]({{< ref "updatableviews" >}}) ([HIVE-1143](https://issues.apache.org/jira/browse/HIVE-1143)) * [Phase 2 of Replication Development]({{< ref "hivereplicationv2development" >}}) 
([HIVE-14841](https://issues.apache.org/jira/browse/HIVE-14841)) * [Subqueries in SELECT]({{< ref "subqueries-in-select" >}}) ([HIVE-16091](https://issues.apache.org/jira/browse/HIVE-16091)) -* [DEFAULT keyword](https://hive.apache.org/development/desingdocs/default-keyword) [(HIVE-19059)](https://issues.apache.org/jira/browse/HIVE-19059) +* [DEFAULT keyword](/development/desingdocs/default-keyword) [(HIVE-19059)](https://issues.apache.org/jira/browse/HIVE-19059) * [Hive remote databases/tables]({{< ref "hive-remote-databases-tables" >}}) ## Incomplete diff --git a/content/Development/desingdocs/enabling-grpc-in-hive-metastore.md b/content/Development/desingdocs/enabling-grpc-in-hive-metastore.md index d99cb6b8..30a80485 100644 --- a/content/Development/desingdocs/enabling-grpc-in-hive-metastore.md +++ b/content/Development/desingdocs/enabling-grpc-in-hive-metastore.md @@ -10,7 +10,7 @@ Cameron Moberg (Google), Zhou Fang (Google), Feng Lu (Google), Thejas Nair (Clo # Objective -* To modernize [Hive Metastore’s](https://hive.apache.org/docs/latest/admin/adminmanual-metastore-3-0-administration) interface with a state-of-the-art serving layer based on gRPC while also keeping it backwards compatible with Thrift for minimal upgrade toil; +* To modernize [Hive Metastore’s](/docs/latest/admin/adminmanual-metastore-3-0-administration) interface with a state-of-the-art serving layer based on gRPC while also keeping it backwards compatible with Thrift for minimal upgrade toil; * To achieve this the proposed design is to add support for a proxy-layer between the Thrift interface and a new gRPC interface that allows for in-memory request/response translation in-between; * To expand the Hive client to work with Hive Metastore server in both gRPC and Thrift mode. 
diff --git a/content/Development/desingdocs/hybrid-grace-hash-join-v1-0.md b/content/Development/desingdocs/hybrid-grace-hash-join-v1-0.md index e38d31d8..c5ec281a 100644 --- a/content/Development/desingdocs/hybrid-grace-hash-join-v1-0.md +++ b/content/Development/desingdocs/hybrid-grace-hash-join-v1-0.md @@ -171,7 +171,7 @@ As of [Hive 2.0.0](https://issues.apache.org/jira/browse/HIVE-11306), a cheap Bl # References * Hybrid Hybrid Grace Hash Join presentation by Mostafa -* MapJoinOptimization +* [MapJoinOptimization](/development/desingdocs/mapjoinoptimization) * [HIVE-1641](https://issues.apache.org/jira/browse/HIVE-1641) add map joined table to distributed cache * [HIVE-1642](https://issues.apache.org/jira/browse/HIVE-1642) Convert join queries to map-join based on size of table/row * Database Management Systems, 3rd ed diff --git a/content/Development/desingdocs/llap.md b/content/Development/desingdocs/llap.md index 09c5b129..857cb62b 100644 --- a/content/Development/desingdocs/llap.md +++ b/content/Development/desingdocs/llap.md @@ -186,7 +186,7 @@ The watch and running nodes options were added in release 2.2.0 with [HIVE-15217 [LLAP Design Document](https://issues.apache.org/jira/secure/attachment/12665704/LLAPdesigndocument.pdf) -[Hive Contributor Meetup Presentation](https://hive.apache.org/attachments/27362054/LLAP-Meetup-Nov.ppsx) +[Hive Contributor Meetup Presentation](/attachments/27362054/LLAP-Meetup-Nov.ppsx) ## Attachments: diff --git a/content/Development/desingdocs/storagehandlers.md b/content/Development/desingdocs/storagehandlers.md index 00c119c4..468762cb 100644 --- a/content/Development/desingdocs/storagehandlers.md +++ b/content/Development/desingdocs/storagehandlers.md @@ -19,7 +19,7 @@ date: 2024-12-12 This page documents the storage handler support being added to Hive as part of work on [HBaseIntegration]({{< ref "hbaseintegration" >}}). 
The motivation is to make it possible to allow Hive to access data stored and managed by other systems in a modular, extensible fashion. -Besides HBase, a storage handler implementation is also available for [Hypertable](http://code.google.com/p/hypertable/wiki/HiveExtension), and others are being developed for [Cassandra](https://issues.apache.org/jira/browse/HIVE-1434), [Azure Table](https://blogs.msdn.microsoft.com/mostlytrue/2014/04/04/analyzing-azure-table-storage-data-with-hdinsight/), [JDBC](https://hive.apache.org/docs/latest/user/jdbc-storage-handler) (MySQL and others), [MongoDB](https://github.com/yc-huang/Hive-mongo), [ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/hadoop/current/hive.html), [Phoenix HBase](https://phoenix.apache.org/hive_storage_handler.html?platform=hootsuite), [VoltDB](https://issues.voltdb.com/browse/ENG-10736?page=com.atlassian.jira.plugin.system.issuetabpanels%3Aall-tabpanel) and [Google Spreadsheets](https://github.com/balshor/gdata-storagehandler).  A [Kafka handler](https://github.com/HiveKa/HiveKa) demo is available. +Besides HBase, a storage handler implementation is also available for [Hypertable](http://code.google.com/p/hypertable/wiki/HiveExtension), and others are being developed for [Cassandra](https://issues.apache.org/jira/browse/HIVE-1434), [Azure Table](https://blogs.msdn.microsoft.com/mostlytrue/2014/04/04/analyzing-azure-table-storage-data-with-hdinsight/), [JDBC](/docs/latest/user/jdbc-storage-handler) (MySQL and others), [MongoDB](https://github.com/yc-huang/Hive-mongo), [ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/hadoop/current/hive.html), [Phoenix HBase](https://phoenix.apache.org/hive_storage_handler.html?platform=hootsuite), [VoltDB](https://issues.voltdb.com/browse/ENG-10736?page=com.atlassian.jira.plugin.system.issuetabpanels%3Aall-tabpanel) and [Google Spreadsheets](https://github.com/balshor/gdata-storagehandler).  
A [Kafka handler](https://github.com/HiveKa/HiveKa) demo is available. Hive storage handler support builds on existing extensibility features in both Hadoop and Hive: @@ -63,7 +63,7 @@ CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name ``` -When STORED BY is specified, then row_format (DELIMITED or SERDE) and STORED AS cannot be specified, however starting from [Hive 4.0](https://hive.apache.org/docs/latest/user/hive-iceberg-integration), they can coexist to create the Iceberg table, this is the only exception. Optional SERDEPROPERTIES can be specified as part of the STORED BY clause and will be passed to the serde provided by the storage handler. +When STORED BY is specified, then row_format (DELIMITED or SERDE) and STORED AS cannot be specified, however starting from [Hive 4.0](/docs/latest/user/hive-iceberg-integration), they can coexist to create the Iceberg table, this is the only exception. Optional SERDEPROPERTIES can be specified as part of the STORED BY clause and will be passed to the serde provided by the storage handler. See [CREATE TABLE]({{< ref "#create-table" >}}) and [Row Format, Storage Format, and SerDe]({{< ref "#row-format,-storage-format,-and-serde" >}}) for more information. diff --git a/content/Development/desingdocs/top-k-stats.md b/content/Development/desingdocs/top-k-stats.md index 0e79b402..9d9fbb45 100644 --- a/content/Development/desingdocs/top-k-stats.md +++ b/content/Development/desingdocs/top-k-stats.md @@ -5,12 +5,12 @@ date: 2024-12-12 # Apache Hive : Column Level Top K Statistics -This document is an addition to [Statistics in Hive](https://hive.apache.org/development/desingdocs/statsdev). It describes the support of collecting column level top K values for Hive tables (see [HIVE-3421](https://issues.apache.org/jira/browse/HIVE-3421)). +This document is an addition to [Statistics in Hive](/development/desingdocs/statsdev). 
It describes the support of collecting column level top K values for Hive tables (see [HIVE-3421](https://issues.apache.org/jira/browse/HIVE-3421)). ## Scope In addition to the partition statistics, column level top K values can also be estimated for Hive tables. - The name and top K values of the most skewed column is stored in the partition or non-partitioned table’s skewed information, if user did not specify [skew](https://hive.apache.org/development/desingdocs/listbucketing). This works for both newly created and existing tables. + The name and top K values of the most skewed column is stored in the partition or non-partitioned table’s skewed information, if user did not specify [skew](/development/desingdocs/listbucketing). This works for both newly created and existing tables. The algorithm for computing top K is based on this paper: [Efficient Computation of Frequent and Top-k Elements in Data Streams](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.114.9563&rep=rep1&type=pdf). ## Implementation diff --git a/content/Development/gettingStarted.md b/content/Development/gettingStarted.md index b20604e0..68aeddcb 100644 --- a/content/Development/gettingStarted.md +++ b/content/Development/gettingStarted.md @@ -62,10 +62,10 @@ project and contribute your expertise. 
[ISSUE_TRACKING]: /community/issuetracking/ [MAILING_LISTS]: /community/mailinglists/ [HIVE_FACEBOOK]: http://www.facebook.com/pages/Hive/43928506208 -[HIVE_DETAILS]: https://hive.apache.org/docs/latest/introduction-to-apache-hive -[HIVE_QL]: https://hive.apache.org/docs/latest/introduction-to-apache-hive#Home-UserDocumentation +[HIVE_DETAILS]: /docs/latest/introduction-to-apache-hive +[HIVE_QL]: /docs/latest/introduction-to-apache-hive#Home-UserDocumentation [COMMUNITY]: /community/people/ -[CONTRIBUTOR]: https://hive.apache.org/docs/latest/introduction-to-apache-hive#Home-ResourcesforContributors +[CONTRIBUTOR]: /docs/latest/introduction-to-apache-hive#Home-ResourcesforContributors [HIVE_TWITTER]: https://twitter.com/apachehive [DOCKER_QUICKSTART]: /development/quickstart/ diff --git a/content/Development/gettingstarted-latest.md b/content/Development/gettingstarted-latest.md index db2cd551..72b2184a 100644 --- a/content/Development/gettingstarted-latest.md +++ b/content/Development/gettingstarted-latest.md @@ -49,7 +49,7 @@ Finally, add `$HIVE_HOME/bin` to your `PATH`: The Hive GIT repository for the most recent Hive code is located here: `git clone ` (the master branch). -All release versions are in branches named "branch-0.#" or "branch-1.#" or the upcoming "branch-2.#", with the exception of release 0.8.1 which is in "branch-0.8-r2". Any branches with other names are feature branches for works-in-progress. See [Understanding Hive Branches](https://hive.apache.org/community/resources/howtocontribute#understanding-hive-branches) for details. +All release versions are in branches named "branch-0.#" or "branch-1.#" or the upcoming "branch-2.#", with the exception of release 0.8.1 which is in "branch-0.8-r2". Any branches with other names are feature branches for works-in-progress. See [Understanding Hive Branches](/community/resources/howtocontribute#understanding-hive-branches) for details. As of 0.13, Hive is built using [Apache Maven](http://maven.apache.org). 
@@ -282,7 +282,7 @@ Hive uses log4j for logging. By default logs are not emitted to the console by t The logs are stored in the directory `/tmp/<*user.name*>`: * `/tmp/<*user.name*>/hive.log` -Note: In [local mode](https://hive.apache.org/development/gettingstarted-latest#GettingStarted-Hive,Map-ReduceandLocal-Mode), prior to Hive 0.13.0 the log file name was "`.log`" instead of "`hive.log`". This bug was fixed in release 0.13.0 (see [HIVE-5528](https://issues.apache.org/jira/browse/HIVE-5528) and [HIVE-5676](https://issues.apache.org/jira/browse/HIVE-5676)). +Note: In [local mode](/development/gettingstarted-latest#GettingStarted-Hive,Map-ReduceandLocal-Mode), prior to Hive 0.13.0 the log file name was "`.log`" instead of "`hive.log`". This bug was fixed in release 0.13.0 (see [HIVE-5528](https://issues.apache.org/jira/browse/HIVE-5528) and [HIVE-5676](https://issues.apache.org/jira/browse/HIVE-5676)). To configure a different log location, set `hive.log.dir` in $HIVE_HOME/conf/hive-log4j.properties. Make sure the directory has the sticky bit set (`chmod 1777 <*dir*>`). @@ -543,7 +543,7 @@ Note that for versions of Hive which don't include [HIVE-287](https://issues.apa ``` This streams the data in the map phase through the script `/bin/cat` (like Hadoop streaming). -Similarly – streaming can be used on the reduce side (please see the [Hive Tutorial](https://hive.apache.org/docs/latest/user/tutorial#custom-mapreduce-scripts) for examples). +Similarly – streaming can be used on the reduce side (please see the [Hive Tutorial](/docs/latest/user/tutorial#custom-mapreduce-scripts) for examples). 
## Simple Example Use Cases diff --git a/content/Development/quickStart.md b/content/Development/quickStart.md index c3ecff2b..19a4720f 100644 --- a/content/Development/quickStart.md +++ b/content/Development/quickStart.md @@ -292,4 +292,4 @@ exit ``` ### Quick Start with REST Catalog Integration -Checkout the quickstart of REST Catalog Integration with Docker here: [REST Catalog Integration](https://hive.apache.org/docs/latest/quickstart-rest-catalog) +Check out the quickstart of REST Catalog Integration with Docker here: [REST Catalog Integration](/docs/latest/quickstart-rest-catalog) diff --git a/content/community/meetings/contributorminutes20111205.md b/content/community/meetings/contributorminutes20111205.md index 742425b8..7b73ece3 100644 --- a/content/community/meetings/contributorminutes20111205.md +++ b/content/community/meetings/contributorminutes20111205.md @@ -19,7 +19,7 @@ Carl also proposed that in the future, developers delivering metastore changes s A patch has been committed to make Hive build and run against Hadoop 0.23, and Carl has set up a Jenkins instance for continuous integration. Some tests are still failing, and it is uncertain whether a Hive binary built against 0.20.x will run against Hadoop 0.23. Once tests are all fixed, we'll start requiring committers to keep them working (e.g. if something gets committed which passes tests on 0.20.x, but breaks 0.23, the committer needs to either submit a timely followup to address the breakage, or else back out the original change). There was some discussion about doing the same for Hadoop 0.20.20x, but no resolution. -Ashutosh asked about a registry of available Hive storage handlers, and John referenced the Introduction section in . +Ashutosh asked about a registry of available Hive storage handlers, and John referenced the Introduction section in [Storage Handlers](/development/desingdocs/storagehandlers). Code walkthroughs were carried out for HIVE-2616 and HIVE-2589. 
diff --git a/content/community/meetings/development-contributorsmeetings-hivecontributorsminutes101025.md b/content/community/meetings/development-contributorsmeetings-hivecontributorsminutes101025.md index bc88d5aa..98b89230 100644 --- a/content/community/meetings/development-contributorsmeetings-hivecontributorsminutes101025.md +++ b/content/community/meetings/development-contributorsmeetings-hivecontributorsminutes101025.md @@ -11,7 +11,7 @@ Location: Facebook Palo Alto Attendees: plus Paul, Ning, Yongqiang, Liyin, Basab -The [TLP](http://hive.apache.org) and [bylaws](https://hive.apache.org/community/bylaws/) votes passed, so Hive is now officially an Apache top level project! We are going ahead with moving the following resources: +The [TLP](http://hive.apache.org) and [bylaws](/community/bylaws/) votes passed, so Hive is now officially an Apache top level project! We are going ahead with moving the following resources: * website (now at hive.apache.org) * svn (new trunk location is ); git will follow soon diff --git a/content/community/resources/developerguide.md b/content/community/resources/developerguide.md index 518f7c78..e31fe648 100644 --- a/content/community/resources/developerguide.md +++ b/content/community/resources/developerguide.md @@ -71,7 +71,7 @@ Also: * An [Avro SerDe]({{< ref "avroserde" >}}) was added in Hive 0.9.1.  Starting in Hive 0.14.0 its specification is implicit with the STORED AS AVRO clause. * A SerDe for the [ORC]({{< ref "languagemanual-orc" >}}) file format was added in Hive 0.11.0. * A SerDe for [Parquet]({{< ref "parquet" >}}) was added via plug-in in Hive 0.10 and natively in Hive 0.13.0. -* A SerDe for [CSV](https://hive.apache.org/docs/latest/user/csv-serde) was added in Hive 0.14. +* A SerDe for [CSV](/docs/latest/user/csv-serde) was added in Hive 0.14. See [SerDe]({{< ref "serde" >}}) for detailed information about input and output processing. 
Also see [Storage Formats]({{< ref "hcatalog-storageformats" >}}) in the [HCatalog manual]({{< ref "hcatalog-base" >}}), including [CTAS Issue with JSON SerDe]({{< ref "#ctas-issue-with-json-serde" >}}). For information about how to create a table with a custom or native SerDe, see [Row Format, Storage Format, and SerDe]({{< ref "#row-format,-storage-format,-and-serde" >}}). diff --git a/content/community/resources/hivedeveloperfaq.md b/content/community/resources/hivedeveloperfaq.md index 5f6ece12..8c6b2ff2 100644 --- a/content/community/resources/hivedeveloperfaq.md +++ b/content/community/resources/hivedeveloperfaq.md @@ -25,8 +25,8 @@ $ perl -i -pe 'sMyClass@g' MyClass.java ## Building -* See [Getting Started: Building Hive from Source](https://hive.apache.org/development/gettingstarted-latest#building-hive-from-source) for detailed information about building Hive releases 0.13 and later with [Maven](http://maven.apache.org/). -* See [Installing from Source Code (Hive 0.12.0 and Earlier)](https://hive.apache.org/docs/latest/admin/adminmanual-installation#installing-from-source-code-hive-0120-and-earlier) for detailed information about building Hive 0.12 and earlier with [Ant](http://ant.apache.org/). +* See [Getting Started: Building Hive from Source](/development/gettingstarted-latest#building-hive-from-source) for detailed information about building Hive releases 0.13 and later with [Maven](http://maven.apache.org/). +* See [Installing from Source Code (Hive 0.12.0 and Earlier)](/docs/latest/admin/adminmanual-installation#installing-from-source-code-hive-0120-and-earlier) for detailed information about building Hive 0.12 and earlier with [Ant](http://ant.apache.org/). 
### Maven settings diff --git a/content/community/resources/howtocommit.md b/content/community/resources/howtocommit.md index b350383d..f436536e 100644 --- a/content/community/resources/howtocommit.md +++ b/content/community/resources/howtocommit.md @@ -15,7 +15,7 @@ New committers are encouraged to first read Apache's generic committer documenta * [ASF Project Security for committers](https://www.apache.org/security/committers.html#asf-project-security-for-committers) * [Apache Committer FAQ](http://www.apache.org/dev/committers.html) -The first act of a new core committer is typically to add their name to the [credits](https://hive.apache.org/community/people/) page. This requires changing the source in +The first act of a new core committer is typically to add their name to the [credits](/community/people/) page. This requires changing the source of that page. Committers are strongly encouraged to subscribe to the [security@hive.apache.org]({{< ref "mailto:security-subscribe@hive-apache-org" >}}) list with their Apache email and help addressing security vulnerability reports. diff --git a/content/community/resources/howtocontribute.md b/content/community/resources/howtocontribute.md index f399e6a4..5473ab59 100644 --- a/content/community/resources/howtocontribute.md +++ b/content/community/resources/howtocontribute.md @@ -28,7 +28,7 @@ This is an optional step. Eclipse has a lot of advanced features for Java develo This checklist tells you how to create accounts and obtain permissions needed by Hive contributors. See the [Hive website](http://hive.apache.org/) for additional information. -* Request an Apache Software Foundation [JIRA account](https://hive.apache.org/community/resources/howtocontribute#request-account), if you do not already have one. +* Request an Apache Software Foundation [JIRA account](/community/resources/howtocontribute#request-account), if you do not already have one. 
+ The ASF JIRA system dashboard is [here](https://issues.apache.org/jira/secure/Dashboard.jspa). + The Hive JIRA is [here](https://issues.apache.org/jira/browse/HIVE). * To review patches check the open [pull requests on GitHub](https://github.com/apache/hive/pulls) @@ -343,7 +343,7 @@ Here are the steps relevant to `hive_metastore.thrift`: Stay Involved -Contributors should join the [Hive mailing lists](https://hive.apache.org/community/mailinglists/). In particular the dev list (to join discussions of changes) and the user list (to help others). +Contributors should join the [Hive mailing lists](/community/mailinglists/). In particular the dev list (to join discussions of changes) and the user list (to help others). ## See Also diff --git a/content/community/resources/presentations.md b/content/community/resources/presentations.md index 565f0a60..a071c8fd 100644 --- a/content/community/resources/presentations.md +++ b/content/community/resources/presentations.md @@ -12,7 +12,7 @@ date: 2024-12-12 * [attachments/27362054/61337098-pptx](/attachments/27362054/61337098.pptx) * [attachments/27362054/61337312-ppsx](/attachments/27362054/61337312.ppsx) * [attachments/27362054/61337398-ppsx](/attachments/27362054/61337398.ppsx) -* [Hive on Spark: now and future - Xuefu Zhang](https://hive.apache.org/attachments/27362054/61337443.pdf) +* [Hive on Spark: now and future - Xuefu Zhang](/attachments/27362054/61337443.pdf) ## November 2015 Hive Contributor Meetup diff --git a/content/community/resources/unit-testing-hive-sql.md b/content/community/resources/unit-testing-hive-sql.md index 1357a596..cb238d66 100644 --- a/content/community/resources/unit-testing-hive-sql.md +++ b/content/community/resources/unit-testing-hive-sql.md @@ -27,11 +27,11 @@ There are a number of challenges posed by both Hive and Hive SQL that can make i # Modularisation -By modularising processes implemented using Hive they become easier to test effectively and more resilient to change. 
Although Hive provides a number of vectors for modularisation it is not always clear how a large process can be decomposed. Features for encapsulation of query logic into components is separated into two perpendicular concerns: column level logic, and set level logic. Column level logic refers to the expressions applied to individual columns or groups of columns in the query, commonly described as ‘functions’. Set level logic concerns Hive SQL constructs that manipulate groupings of data such as: column projection with `SELECT`, `GROUP BY` aggregates, `JOIN`s, `ORDER BY` sorting, etc. In either case we expect individual components to live in their own source file or deployable artifact and imported as needed by the composition. For Hive SQL-based components, the `[SOURCE](https://hive.apache.org/docs/latest/language/languagemanual-cli#hive-interactive-shell-commands)` command provides this functionality. +By modularising processes implemented using Hive they become easier to test effectively and more resilient to change. Although Hive provides a number of vectors for modularisation it is not always clear how a large process can be decomposed. Features for encapsulation of query logic into components are separated into two perpendicular concerns: column level logic, and set level logic. Column level logic refers to the expressions applied to individual columns or groups of columns in the query, commonly described as ‘functions’. Set level logic concerns Hive SQL constructs that manipulate groupings of data such as: column projection with `SELECT`, `GROUP BY` aggregates, `JOIN`s, `ORDER BY` sorting, etc. In either case we expect individual components to live in their own source file or deployable artifact and imported as needed by the composition. For Hive SQL-based components, the [`SOURCE`](/docs/latest/language/languagemanual-cli#hive-interactive-shell-commands) command provides this functionality. 
### Encapsulation of column level logic -In the case of column level logic Hive provides both [UDFs](https://hive.apache.org/docs/latest/language/hiveplugins#creating-custom-udfs) and [macros]({{< ref "#macros" >}}) that allow the user to extract and reuse the expressions applied to columns. Once defined, UDFs and Macros can be readily isolated for testing. UDFs can be simply tested with existing Java/Python unit test tools such as JUnit whereas Macros require a Hive command line interface to execute the macro declaration and then exercise it with some sample `SELECT` statements. +In the case of column level logic Hive provides both [UDFs](/docs/latest/language/hiveplugins#creating-custom-udfs) and [macros]({{< ref "#macros" >}}) that allow the user to extract and reuse the expressions applied to columns. Once defined, UDFs and Macros can be readily isolated for testing. UDFs can be simply tested with existing Java/Python unit test tools such as JUnit whereas Macros require a Hive command line interface to execute the macro declaration and then exercise it with some sample `SELECT` statements. ### Encapsulation of set level logic @@ -98,9 +98,9 @@ The following Hive specific practices can be used to make processes more amenabl * Modularise large or complex queries into multiple smaller components. These are easier to comprehend, maintain, and test. * Use macros or UDFs to encapsulate repeated or complex column expressions. -* Use [Hive variables](https://hive.apache.org/docs/latest/language/languagemanual-variablesubstitution) to decouple SQL scripts from specific environments. For example it might be wise to use `LOCATION ${myTableLocation}` in preference to `LOCATION /hard/coded/path`. +* Use [Hive variables](/docs/latest/language/languagemanual-variablesubstitution) to decouple SQL scripts from specific environments. For example it might be wise to use `LOCATION ${myTableLocation}` in preference to `LOCATION /hard/coded/path`. * Keep the scope of tests small. 
Making coarse assertions on the entire contents of a table is brittle and has a high maintenance requirement. -* Use the `[SOURCE](https://hive.apache.org/docs/latest/language/languagemanual-cli#hive-interactive-shell-commands)` command to combine multiple smaller SQL scripts. +* Use the [`SOURCE`](/docs/latest/language/languagemanual-cli#hive-interactive-shell-commands) command to combine multiple smaller SQL scripts. * Test macros and the integration of UDFs by creating simple test tables and applying the functions to columns in those tables. * Test UDFs by invoking the lifecycle methods directly (`initialize`, `evaluate`, etc.) in a standard testing framework such as JUnit. diff --git a/content/docs/latest/admin/adminmanual-configuration.md b/content/docs/latest/admin/adminmanual-configuration.md index 7026f3a9..b7dd3a55 100644 --- a/content/docs/latest/admin/adminmanual-configuration.md +++ b/content/docs/latest/admin/adminmanual-configuration.md @@ -108,7 +108,7 @@ Version information: Metrics | hive.resource.use.hdfs.location | Reference HDFS based files/jars directly instead of copying to session based HDFS scratch directory. (As of Hive [2.2.1](https://issues.apache.org/jira/browse/HIVE-17574).) | true | | hive.jar.path | The location of hive_cli.jar that is used when submitting jobs in a separate jvm. |   | | hive.aux.jars.path | The location of the plugin jars that contain implementations of user defined functions and SerDes. |   | -| hive.reloadable.aux.jars.path | The location of plugin jars that can be renewed (added, removed, or updated) by executing the [Beeline reload command]({{< ref "#beeline-reload-command" >}}), without having to restart HiveServer2. These jars can be used just like the auxiliary classes in hive.aux.jars.path[for creating UDFs or SerDes](https://hive.apache.org/docs/latest/user/configuration-properties#hiveauxjarspath). (As of Hive [0.14.0](https://issues.apache.org/jira/browse/HIVE-7553).) 
| | +| hive.reloadable.aux.jars.path | The location of plugin jars that can be renewed (added, removed, or updated) by executing the [Beeline reload command]({{< ref "#beeline-reload-command" >}}), without having to restart HiveServer2. These jars can be used just like the auxiliary classes in hive.aux.jars.path [for creating UDFs or SerDes](/docs/latest/user/configuration-properties#hiveauxjarspath). (As of Hive [0.14.0](https://issues.apache.org/jira/browse/HIVE-7553).) | | | hive.partition.pruning | A strict value for this variable indicates that an error is thrown by the compiler in case no partition predicate is provided on a partitioned table. This is used to protect against a user inadvertently issuing a query against all the partitions of the table. | nonstrict | | hive.map.aggr | Determines whether the map side aggregation is on or not. | true | | hive.join.emit.interval |   | 1000 | @@ -134,7 +134,7 @@ Version information: Metrics Please see [Hive Metastore Administration]({{< ref "adminmanual-metastore-administration" >}}) for information about the configuration variables used to set up the metastore in local, remote, or embedded mode. Also see descriptions in the [Metastore]({{< ref "#metastore" >}}) section of the Language Manual's [Hive Configuration Properties]({{< ref "configuration-properties" >}}). -For security configuration (Hive 0.10 and later), see the [Hive Metastore Security](https://hive.apache.org/docs/latest/user/configuration-properties#hive-metastore-security) section in the Language Manual's [Hive Configuration Properties]({{< ref "configuration-properties" >}}). +For security configuration (Hive 0.10 and later), see the [Hive Metastore Security](/docs/latest/user/configuration-properties#hive-metastore-security) section in the Language Manual's [Hive Configuration Properties]({{< ref "configuration-properties" >}}). 
#### Configuration Variables Used to Interact with Hadoop @@ -203,7 +203,7 @@ Starting in Hive release 0.11.0, HCatalog is installed and configured with Hive. * See [Hive Metastore Administration]({{< ref "adminmanual-metastore-administration" >}}) for metastore configuration properties. * See [HCatalog Installation from Tarball]({{< ref "hcatalog-installhcat" >}}) for additional information. -For Hive releases prior to 0.11.0, see the "Thrift Server Setup" section in the HCatalog 0.5.0 document [Installation from Tarball](http://hive.apache.org/docs/hcat_r0.5.0/install.html). +For Hive releases prior to 0.11.0, see the "Thrift Server Setup" section in the HCatalog 0.5.0 document [Installation from Tarball](/docs/latest/hcatalog/hcatalog-installhcat/). ### WebHCat diff --git a/content/docs/latest/admin/adminmanual-installation.md b/content/docs/latest/admin/adminmanual-installation.md index e6663e80..e1804a23 100644 --- a/content/docs/latest/admin/adminmanual-installation.md +++ b/content/docs/latest/admin/adminmanual-installation.md @@ -107,7 +107,7 @@ $ bin/hive The Hive home directory is the one with the contents of build/dist for Hive 0.12 and earlier; for Hive 0.13 and later it is packaging/target/apache-hive-**-bin/apache-hive-**-bin/. -HiveServer2 (introduced in Hive 0.11) has a new CLI called Beeline (see [Beeline – New Command Line Shell](https://hive.apache.org/docs/latest/user/hiveserver2-clients#beeline--command-line-shell)). To use Beeline, execute the following command in the Hive home directory: +HiveServer2 (introduced in Hive 0.11) has a new CLI called Beeline (see [Beeline – New Command Line Shell](/docs/latest/user/hiveserver2-clients#beeline--command-line-shell)). 
To use Beeline, execute the following command in the Hive home directory: ``` $ bin/beeline diff --git a/content/docs/latest/admin/adminmanual-metastore-administration.md b/content/docs/latest/admin/adminmanual-metastore-administration.md index 9b368a21..c965b66b 100644 --- a/content/docs/latest/admin/adminmanual-metastore-administration.md +++ b/content/docs/latest/admin/adminmanual-metastore-administration.md @@ -27,9 +27,9 @@ Configuration options for **metastore server**: #### Basic Configuration Parameters -The relevant configuration parameters are shown here. (Non-metastore parameters are described in [Configuring Hive](https://hive.apache.org/docs/latest/admin/adminmanual-configuration). Also see the Language Manual's [Hive Configuration Properties](https://hive.apache.org/docs/latest/user/configuration-properties), including [Metastore](https://hive.apache.org/docs/latest/user/configuration-properties#metastore) and [Hive Metastore Security](https://hive.apache.org/docs/latest/user/configuration-properties#hive-metastore-security).) +The relevant configuration parameters are shown here. (Non-metastore parameters are described in [Configuring Hive](/docs/latest/admin/adminmanual-configuration). Also see the Language Manual's [Hive Configuration Properties](/docs/latest/user/configuration-properties), including [Metastore](/docs/latest/user/configuration-properties#metastore) and [Hive Metastore Security](/docs/latest/user/configuration-properties#hive-metastore-security).) -Also see hivemetastore-site.xml documentation under [Configuring Hive](https://hive.apache.org/docs/latest/admin/adminmanual-configuration). +Also see hivemetastore-site.xml documentation under [Configuring Hive](/docs/latest/admin/adminmanual-configuration). 
| Configuration Parameter | Description | | --- | --- | @@ -43,7 +43,7 @@ The Hive metastore is stateless and thus there can be multiple instances to achi #### Additional Configuration Parameters -The following metastore configuration parameters were carried over from old documentation without a guarantee that they all still exist. See the `HiveConf` Java class for current Hive configuration options, and see the [Metastore](https://hive.apache.org/docs/latest/user/configuration-properties#metastore) and [Hive Metastore Security](https://hive.apache.org/docs/latest/user/configuration-properties#hive-metastore-security) sections of the Language Manual's [Hive Configuration Properties](https://hive.apache.org/docs/latest/user/configuration-properties) for user-friendly descriptions of the metastore parameters. +The following metastore configuration parameters were carried over from old documentation without a guarantee that they all still exist. See the `HiveConf` Java class for current Hive configuration options, and see the [Metastore](/docs/latest/user/configuration-properties#metastore) and [Hive Metastore Security](/docs/latest/user/configuration-properties#hive-metastore-security) sections of the Language Manual's [Hive Configuration Properties](/docs/latest/user/configuration-properties) for user-friendly descriptions of the metastore parameters. | Configuration Parameter | Description | Default Value | | --- | --- | --- | @@ -91,7 +91,7 @@ Derby is the default database for the embedded metastore. | javax.jdo.option.ConnectionDriverName | `org.apache.derby.jdbc.EmbeddedDriver` | Derby embeded JDBC driver class. | | hive.metastore.warehouse.dir | `file://${user.dir}/../build/ql/test/data/warehouse` | Unit test data goes in here on your local filesystem. | -If you want to run Derby as a network server so the metastore can be accessed from multiple nodes, see [Hive Using Derby in Server Mode](https://hive.apache.org/docs/latest/admin/hivederbyservermode). 
+If you want to run Derby as a network server so the metastore can be accessed from multiple nodes, see [Hive Using Derby in Server Mode](/docs/latest/admin/hivederbyservermode). ### Remote Metastore Database diff --git a/content/docs/latest/admin/adminmanual-settinguphiveserver.md b/content/docs/latest/admin/adminmanual-settinguphiveserver.md index 88e711f8..5b6f5a78 100644 --- a/content/docs/latest/admin/adminmanual-settinguphiveserver.md +++ b/content/docs/latest/admin/adminmanual-settinguphiveserver.md @@ -7,7 +7,7 @@ date: 2024-12-12 ## Setting Up Hive Server -* [Setting Up HiveServer2](https://hive.apache.org/docs/latest/admin/setting-up-hiveserver2) +* [Setting Up HiveServer2](/docs/latest/admin/setting-up-hiveserver2) * [Setting Up Thrift Hive Server]({{< ref "hiveserver" >}}) * [Setting Up Hive JDBC Server]({{< ref "hivejdbcinterface" >}}) * [Setting Up Hive ODBC Server]({{< ref "hiveodbc" >}}) diff --git a/content/docs/latest/admin/hive-on-spark-getting-started.md b/content/docs/latest/admin/hive-on-spark-getting-started.md index b12da289..cee3ddb5 100644 --- a/content/docs/latest/admin/hive-on-spark-getting-started.md +++ b/content/docs/latest/admin/hive-on-spark-getting-started.md @@ -223,12 +223,12 @@ spark.kryo.referenceTracking=false spark.kryo.classesToRegister=org.apache.hadoop.hive.ql.io.HiveKey,org.apache.hadoop.io.BytesWritable,org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch ``` -See [Spark section of configuration page](https://hive.apache.org/docs/latest/user/configuration-properties#spark) for additional properties. +See [Spark section of configuration page](/docs/latest/user/configuration-properties#spark) for additional properties. 
## Design documents * [Hive on Spark: Overall Design](https://issues.apache.org/jira/secure/attachment/12652517/Hive-on-Spark.pdf) from [HIVE-7272](https://issues.apache.org/jira/browse/HIVE-7292) -* [Hive on Spark: Join Design (HIVE-7613)](https://hive.apache.org/development/desingdocs/hive-on-spark-join-design-master) +* [Hive on Spark: Join Design (HIVE-7613)](/development/desingdocs/hive-on-spark-join-design-master) * [Hive on Spark Configuration (HIVE-9449)](https://issues.apache.org/jira/browse/HIVE-9449) * [attachments/44302539/53575687.pdf](/attachments/44302539/53575687.pdf) diff --git a/content/docs/latest/admin/manual-installation.md b/content/docs/latest/admin/manual-installation.md index 91e21b0c..d753d1d8 100644 --- a/content/docs/latest/admin/manual-installation.md +++ b/content/docs/latest/admin/manual-installation.md @@ -459,7 +459,7 @@ You can begin using Hive as soon as it is installed, it should be work on you co ## Beeline CLI -HiveServer2 has a CLI called Beeline (see [Beeline – New Command Line Shell](https://hive.apache.org/docs/latest/user/hiveserver2-clients#beeline--command-line-shell)). To use Beeline, execute the following command in the Hive home directory: +HiveServer2 has a CLI called Beeline (see [Beeline – New Command Line Shell](/docs/latest/user/hiveserver2-clients#beeline--command-line-shell)). 
To use Beeline, execute the following command in the Hive home directory: ``` $ bin/beeline diff --git a/content/docs/latest/admin/setting-up-hiveserver2.md b/content/docs/latest/admin/setting-up-hiveserver2.md index 9c213be7..eff94db0 100644 --- a/content/docs/latest/admin/setting-up-hiveserver2.md +++ b/content/docs/latest/admin/setting-up-hiveserver2.md @@ -5,7 +5,7 @@ date: 2024-12-12 # Apache Hive : Setting Up HiveServer2 -[HiveServer2](https://hive.apache.org/docs/latest/user/hiveserver2-overview) (HS2) is a server interface that enables remote clients to execute queries against Hive and retrieve the results (a more detailed intro [here](https://hive.apache.org/docs/latest/user/hiveserver2-overview)). The current implementation, based on Thrift RPC, is an improved version of [HiveServer]({{< ref "hiveserver" >}}) and supports multi-client concurrency and authentication. It is designed to provide better support for open API clients like JDBC and ODBC. +[HiveServer2](/docs/latest/user/hiveserver2-overview) (HS2) is a server interface that enables remote clients to execute queries against Hive and retrieve the results (a more detailed intro [here](/docs/latest/user/hiveserver2-overview)). The current implementation, based on Thrift RPC, is an improved version of [HiveServer]({{< ref "hiveserver" >}}) and supports multi-client concurrency and authentication. It is designed to provide better support for open API clients like JDBC and ODBC. * The Thrift interface definition language (IDL) for HiveServer2 is available at . * Thrift documentation is available at . @@ -37,7 +37,7 @@ HIVE_SERVER2_THRIFT_PORT – Optional TCP port number to listen on, default 1000 #### **Running in HTTP Mode** -HiveServer2 provides support for sending Thrift RPC messages over HTTP transport (Hive 0.13 onward, see [HIVE-4752](https://issues.apache.org/jira/browse/HIVE-4752)). 
This is particularly useful to support a proxying intermediary between the client and the server (for example, for load balancing or security reasons). Currently, you can run HiveServer2 in either TCP mode or the HTTP mode, but not in both. For the corresponding JDBC URL check this link: [HiveServer2 Clients -- JDBC Connection URLs](https://hive.apache.org/docs/latest/user/hiveserver2-clients#jdbc). Use the following settings to enable and configure HTTP mode: +HiveServer2 provides support for sending Thrift RPC messages over HTTP transport (Hive 0.13 onward, see [HIVE-4752](https://issues.apache.org/jira/browse/HIVE-4752)). This is particularly useful to support a proxying intermediary between the client and the server (for example, for load balancing or security reasons). Currently, you can run HiveServer2 in either TCP mode or the HTTP mode, but not in both. For the corresponding JDBC URL check this link: [HiveServer2 Clients -- JDBC Connection URLs](/docs/latest/user/hiveserver2-clients#jdbc). Use the following settings to enable and configure HTTP mode: | Setting | Default | Description | | --- | --- | --- | @@ -49,11 +49,11 @@ HiveServer2 provides support for sending Thrift RPC messages over HTTP transport ##### Cookie Based Authentication -[HIVE-9709](https://issues.apache.org/jira/browse/HIVE-9709) and [HIVE-9710](https://issues.apache.org/jira/browse/HIVE-9710) introduced cookie based authentication for HiveServer2 in HTTP mode. The HiveServer2 parameters (hive.server2.thrift.http.cookie.*) associated with this change can be found [here](https://hive.apache.org/docs/latest/user/configuration-properties#hiveserver2thrifthttpcookieauthenabled). +[HIVE-9709](https://issues.apache.org/jira/browse/HIVE-9709) and [HIVE-9710](https://issues.apache.org/jira/browse/HIVE-9710) introduced cookie based authentication for HiveServer2 in HTTP mode. 
The HiveServer2 parameters (hive.server2.thrift.http.cookie.*) associated with this change can be found [here](/docs/latest/user/configuration-properties#hiveserver2thrifthttpcookieauthenabled). #### Optional Global Init File -A global init file can be placed in the configured [hive.server2.global.init.file.location](https://hive.apache.org/docs/latest/user/configuration-properties#hiveserver2globalinitfilelocation) location (Hive 0.14 onward, see [HIVE-5160](https://issues.apache.org/jira/browse/HIVE-5160), [HIVE-7497](https://issues.apache.org/jira/browse/HIVE-7497), and [HIVE-8138](https://issues.apache.org/jira/browse/HIVE-8138)). This can be either the path to the init file itself, or a directory where an init file named ".hiverc" is expected. +A global init file can be placed in the configured [hive.server2.global.init.file.location](/docs/latest/user/configuration-properties#hiveserver2globalinitfilelocation) location (Hive 0.14 onward, see [HIVE-5160](https://issues.apache.org/jira/browse/HIVE-5160), [HIVE-7497](https://issues.apache.org/jira/browse/HIVE-7497), and [HIVE-8138](https://issues.apache.org/jira/browse/HIVE-8138)). This can be either the path to the init file itself, or a directory where an init file named ".hiverc" is expected. The init file lists a set of commands that will run for users of this HiveServer2 instance, such as register a standard set of jars and functions. diff --git a/content/docs/latest/hcatalog/hcatalog-authorization.md b/content/docs/latest/hcatalog/hcatalog-authorization.md index 345e269b..ae86d06c 100644 --- a/content/docs/latest/hcatalog/hcatalog-authorization.md +++ b/content/docs/latest/hcatalog/hcatalog-authorization.md @@ -48,7 +48,7 @@ Details of HDFS permissions are given at `ht``tp://hadoop.apache.org/docs/r`*x.x Links to documentation for different releases of Hadoop can be found here: . 
-**Note**: If [hive.warehouse.subdir.inherit.perms](https://hive.apache.org/docs/latest/user/configuration-properties#hivewarehousesubdirinheritperms) is enabled, permissions and ACL's for Hive-created files and directories will be set via the following [permission inheritance]({{< ref "permission-inheritance-in-hive" >}}) rules. +**Note**: If [hive.warehouse.subdir.inherit.perms](/docs/latest/user/configuration-properties#hivewarehousesubdirinheritperms) is enabled, permissions and ACL's for Hive-created files and directories will be set via the following [permission inheritance]({{< ref "permission-inheritance-in-hive" >}}) rules. The file system’s logic for determining if a user has permission on the directory or file will be used by Hive.  diff --git a/content/docs/latest/hcatalog/hcatalog-installhcat.md b/content/docs/latest/hcatalog/hcatalog-installhcat.md index 52f05b9f..ff10cc47 100644 --- a/content/docs/latest/hcatalog/hcatalog-installhcat.md +++ b/content/docs/latest/hcatalog/hcatalog-installhcat.md @@ -36,7 +36,7 @@ HCatalog server is the same as Hive metastore. 
You can just follow the [Hive met Previous: [Using HCatalog]({{< ref "hcatalog-usinghcat" >}}) Next: [HCatalog Configuration Properties]({{< ref "hcatalog-configuration-properties" >}}) -Hive installation and configuration: [Installing Hive]({{< ref "adminmanual-installation" >}}), [Configuring Hive]({{< ref "adminmanual-configuration" >}}), [Hive Configuration Properties](https://hive.apache.org/docs/latest/user/configuration-properties) +Hive installation and configuration: [Installing Hive]({{< ref "adminmanual-installation" >}}), [Configuring Hive]({{< ref "adminmanual-configuration" >}}), [Hive Configuration Properties](/docs/latest/user/configuration-properties) WebHCat installation and configuration: [WebHCat Installation]({{< ref "webhcat-installwebhcat" >}}), [WebHCat Configuration]({{< ref "webhcat-configure" >}}) diff --git a/content/docs/latest/hcatalog/hcatalog-streaming-mutation-api.md b/content/docs/latest/hcatalog/hcatalog-streaming-mutation-api.md index 65ba2c30..503f4203 100644 --- a/content/docs/latest/hcatalog/hcatalog-streaming-mutation-api.md +++ b/content/docs/latest/hcatalog/hcatalog-streaming-mutation-api.md @@ -5,7 +5,7 @@ date: 2024-12-12 # Apache Hive : HCatalog Streaming Mutation API -A Java API focused on mutating (insert/update/delete) records into transactional tables using Hive’s [ACID](https://hive.apache.org/docs/latest/user/hive-transactions) feature. It is introduced in Hive 2.0.0 ([HIVE-10165](https://issues.apache.org/jira/browse/HIVE-10165)). +A Java API focused on mutating (insert/update/delete) records into transactional tables using Hive’s [ACID](/docs/latest/user/hive-transactions) feature. It is introduced in Hive 2.0.0 ([HIVE-10165](https://issues.apache.org/jira/browse/HIVE-10165)). 
# Background diff --git a/content/docs/latest/introduction-to-apache-hive.md b/content/docs/latest/introduction-to-apache-hive.md index e3a87475..4137610e 100644 --- a/content/docs/latest/introduction-to-apache-hive.md +++ b/content/docs/latest/introduction-to-apache-hive.md @@ -12,12 +12,12 @@ Built on top of **[Apache Hadoop™](http://hadoop.apache.org/)**, Hive provide * Access to files stored either directly in **[Apache HDFS](http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html)[™](http://hadoop.apache.org/)** or in other data storage systems such as **[Apache HBase](http://hbase.apache.org/)[™](http://hadoop.apache.org/)** * Query execution via [Apache Tez](http://tez.apache.org/)**[™](http://hadoop.apache.org/)** or [MapReduce](http://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html) * Procedural language with HPL-SQL -* Sub-second query retrieval via [Hive LLAP](https://hive.apache.org/development/desingdocs/llap), [Apache YARN](https://hadoop.apache.org/docs/r2.7.2/hadoop-yarn/hadoop-yarn-site/YARN.html) and [Apache Slider](https://slider.incubator.apache.org/). +* Sub-second query retrieval via [Hive LLAP](/development/desingdocs/llap), [Apache YARN](https://hadoop.apache.org/docs/r2.7.2/hadoop-yarn/hadoop-yarn-site/YARN.html) and [Apache Slider](https://slider.incubator.apache.org/). -[Hive provides standard SQL functionality](https://hive.apache.org/docs/latest/language/apache-hive-sql-conformance), including many of the later [SQL:2003](https://en.wikipedia.org/wiki/SQL:2003), [SQL:2011](https://en.wikipedia.org/wiki/SQL:2011), and [SQL:2016](https://en.wikipedia.org/wiki/SQL:2016) features for analytics. 
+[Hive provides standard SQL functionality](/docs/latest/language/apache-hive-sql-conformance), including many of the later [SQL:2003](https://en.wikipedia.org/wiki/SQL:2003), [SQL:2011](https://en.wikipedia.org/wiki/SQL:2011), and [SQL:2016](https://en.wikipedia.org/wiki/SQL:2016) features for analytics. Hive's SQL can also be extended with user code via user defined functions (UDFs), user defined aggregates (UDAFs), and user defined table functions (UDTFs). -There is not a single "Hive format" in which data must be stored. Hive comes with built in connectors for comma and tab-separated values (CSV/TSV) text files, [Apache Parquet](http://parquet.apache.org/)**[™](http://hadoop.apache.org/)**, [Apache ORC](http://orc.apache.org/)**[™](http://hadoop.apache.org/)**, and other formats. Users can extend Hive with connectors for other formats. Please see [File Formats](https://hive.apache.org/community/resources/developerguide#file-formats) and [Hive SerDe](https://hive.apache.org/community/resources/developerguide#hive-serde) in the [Developer Guide](https://hive.apache.org/community/resources/developerguide) for details. +There is not a single "Hive format" in which data must be stored. Hive comes with built in connectors for comma and tab-separated values (CSV/TSV) text files, [Apache Parquet](http://parquet.apache.org/)**[™](http://hadoop.apache.org/)**, [Apache ORC](http://orc.apache.org/)**[™](http://hadoop.apache.org/)**, and other formats. Users can extend Hive with connectors for other formats. Please see [File Formats](/community/resources/developerguide#file-formats) and [Hive SerDe](/community/resources/developerguide#hive-serde) in the [Developer Guide](/community/resources/developerguide) for details. Hive is not designed for online transaction processing (OLTP) workloads. It is best used for traditional data warehousing tasks. 
@@ -25,8 +25,8 @@ Hive is designed to maximize scalability (scale out with more machines added dyn Components of Hive include HCatalog and WebHCat. -* **[HCatalog](https://hive.apache.org/docs/latest/hcatalog/hcatalog-base)** is a table and storage management layer for Hadoop that enables users with different data processing tools — including Pig and MapReduce — to more easily read and write data on the grid. -* **[WebHCat](https://hive.apache.org/docs/latest/webhcat/webhcat-base)** provides a service that you can use to run Hadoop MapReduce (or YARN), Pig, Hive jobs. You can also perform Hive metadata operations using an HTTP (REST style) interface. +* **[HCatalog](/docs/latest/hcatalog/hcatalog-base)** is a table and storage management layer for Hadoop that enables users with different data processing tools — including Pig and MapReduce — to more easily read and write data on the grid. +* **[WebHCat](/docs/latest/webhcat/webhcat-base)** provides a service that you can use to run Hadoop MapReduce (or YARN), Pig, Hive jobs. You can also perform Hive metadata operations using an HTTP (REST style) interface. 
diff --git a/content/docs/latest/language/apache-hive-sql-conformance.md b/content/docs/latest/language/apache-hive-sql-conformance.md index 06fbc712..6dda6d3d 100644 --- a/content/docs/latest/language/apache-hive-sql-conformance.md +++ b/content/docs/latest/language/apache-hive-sql-conformance.md @@ -13,8 +13,8 @@ The formal name of the current SQL standard is ISO/IEC 9075 "Database Language S | Version | Supported SQL Features | | --- | --- | -| Apache Hive 2.1 | [Supported SQL Features](https://hive.apache.org/docs/latest/language/supported-features-apache-hive-2-1) | -| Apache Hive 2.3 | [Supported SQL Features](https://hive.apache.org/docs/latest/language/supported-features-apache-hive-2-3) | +| Apache Hive 2.1 | [Supported SQL Features](/docs/latest/language/supported-features-apache-hive-2-1) | +| Apache Hive 2.3 | [Supported SQL Features](/docs/latest/language/supported-features-apache-hive-2-3) | | Apache Hive 3.1 | [Supported SQL Features]({{< ref "supported-features" >}}) | Information in these pages is not guaranteed to be accurate. Corrections can be submitted to the Apache Hive mailing list at [user@hive.apache.org]({{< ref "mailto:user@hive-apache-org" >}}). diff --git a/content/docs/latest/language/compaction-pooling.md b/content/docs/latest/language/compaction-pooling.md index b41409c9..227125d9 100644 --- a/content/docs/latest/language/compaction-pooling.md +++ b/content/docs/latest/language/compaction-pooling.md @@ -21,7 +21,7 @@ Databases, tables and partitions can be assigned to compaction pools through the hive.compactor.worker.pool={pool_name} ``` -[Database/Table property](https://hive.apache.org/docs/latest/user/hive-transactions#table-properties). If the property is set on Database level, it applies to all tables and partitions. The pool also can be assigned on a table/partition level, in this case it overrides the Database level value (if set).  +[Database/Table property](/docs/latest/user/hive-transactions#table-properties). 
If the property is set on Database level, it applies to all tables and partitions. The pool also can be assigned on a table/partition level, in this case it overrides the Database level value (if set).  If any of the above is set, it is used by the *Initiator* during the creation of the compaction requests. #### Manual pool assignment diff --git a/content/docs/latest/language/enhanced-aggregation-cube-grouping-and-rollup.md b/content/docs/latest/language/enhanced-aggregation-cube-grouping-and-rollup.md index d4c04110..293a792b 100644 --- a/content/docs/latest/language/enhanced-aggregation-cube-grouping-and-rollup.md +++ b/content/docs/latest/language/enhanced-aggregation-cube-grouping-and-rollup.md @@ -18,7 +18,7 @@ Version GROUPING__ID is compliant with semantics in other SQL engines starting in Hive 2.3.0 (see [HIVE-16102](https://issues.apache.org/jira/browse/HIVE-16102)). Support for SQL *grouping* function was added in Hive 2.3.0 too (see [HIVE-15409](https://issues.apache.org/jira/browse/HIVE-15409)). -For general information about GROUP BY, see [GroupBy](https://hive.apache.org/docs/latest/language/languagemanual-groupby) in the Language Manual. +For general information about GROUP BY, see [GroupBy](/docs/latest/language/languagemanual-groupby) in the Language Manual. ### GROUPING SETS clause diff --git a/content/docs/latest/language/hive-operators.md b/content/docs/latest/language/hive-operators.md index 5937f033..49dc4c53 100644 --- a/content/docs/latest/language/hive-operators.md +++ b/content/docs/latest/language/hive-operators.md @@ -51,7 +51,7 @@ The following operators support various common arithmetic operations on the oper | A + B | All number types | Gives the result of adding A and B. The type of the result is the same as the common parent(in the type hierarchy) of the types of the operands. For example, since every integer is a float, therefore float is a containing type of integer so the + operator on a float and an int will result in a float. 
| | A - B | All number types | Gives the result of subtracting B from A. The type of the result is the same as the common parent(in the type hierarchy) of the types of the operands. | | A * B | All number types | Gives the result of multiplying A and B. The type of the result is the same as the common parent(in the type hierarchy) of the types of the operands. Note that if the multiplication causes overflow, you will have to cast one of the operators to a type higher in the type hierarchy. | -| A / B | All number types | Gives the result of dividing A by B. The result is a double type in most cases. When A and B are both integers, the result is a double type except when the [hive.compat](https://hive.apache.org/docs/latest/user/configuration-properties#hivecompat) configuration parameter is set to "0.13" or "latest" in which case the result is a decimal type. | +| A / B | All number types | Gives the result of dividing A by B. The result is a double type in most cases. When A and B are both integers, the result is a double type except when the [hive.compat](/docs/latest/user/configuration-properties#hivecompat) configuration parameter is set to "0.13" or "latest" in which case the result is a decimal type. | | A DIV B | Integer types | Gives the integer part resulting from dividing A by B. E.g 17 div 3 results in 5. | | A % B | All number types | Gives the remainder resulting from dividing A by B. The type of the result is the same as the common parent(in the type hierarchy) of the types of the operands. | | A & B | All number types | Gives the result of bitwise AND of A and B. The type of the result is the same as the common parent(in the type hierarchy) of the types of the operands. 
| diff --git a/content/docs/latest/language/languagemanual-commands.md b/content/docs/latest/language/languagemanual-commands.md index 8924fad8..12939be1 100644 --- a/content/docs/latest/language/languagemanual-commands.md +++ b/content/docs/latest/language/languagemanual-commands.md @@ -24,7 +24,7 @@ Commands are non-SQL statements such as setting a property or adding a resource. | dfs | Executes a dfs command from the Hive shell. | | | Executes a Hive query and prints results to standard output. | | source FILE | Executes a script file inside the CLI. | -| compile `` AS GROOVY NAMED | This allows inline Groovy code to be compiled and be used as a UDF (as of Hive [0.13.0](https://issues.apache.org/jira/browse/HIVE-5252)). For a usage example, see [Nov. 2013 Hive Contributors Meetup Presentations – Using Dynamic Compilation with Hive](https://hive.apache.org/attachments/27362054/HiveContrib-Nov13-groovy_plus_hive.pptx). | +| compile `` AS GROOVY NAMED | This allows inline Groovy code to be compiled and be used as a UDF (as of Hive [0.13.0](https://issues.apache.org/jira/browse/HIVE-5252)). For a usage example, see [Nov. 2013 Hive Contributors Meetup Presentations – Using Dynamic Compilation with Hive](/attachments/27362054/HiveContrib-Nov13-groovy_plus_hive.pptx). | | show processlist | Displays information about the operations currently running on HiveServer2. It helps to troubleshoot issues such as long running queries, connection starvation, etc. The command was introduced in [HIVE-27829](https://issues.apache.org/jira/browse/HIVE-27829). 
| Sample Usage: diff --git a/content/docs/latest/language/languagemanual-ddl.md b/content/docs/latest/language/languagemanual-ddl.md index f064a8a0..f08c198f 100644 --- a/content/docs/latest/language/languagemanual-ddl.md +++ b/content/docs/latest/language/languagemanual-ddl.md @@ -56,7 +56,7 @@ The uses of SCHEMA and DATABASE are interchangeable – they mean the same thing MANAGEDLOCATION was added to database in Hive 4.0.0 ([HIVE-22995](https://issues.apache.org/jira/browse/HIVE-22995)). LOCATION now refers to the default directory for external tables and MANAGEDLOCATION refers to the default directory for managed tables. Its recommended that MANAGEDLOCATION be within metastore.warehouse.dir so all managed tables have a common root where common governance policies. It can be used with metastore.warehouse.tenant.colocation to have it point to a directory outside the warehouse root directory to have a tenant based common root where quotas and other policies can be set.  -REMOTE databases were added in Hive 4.0.0 ([HIVE-24396](https://issues.apache.org/jira/browse/HIVE-24396)) for support for Data connectors. See documentation for [Data connectors](https://hive.apache.org/docs/latest/user/data-connectors-in-hive).  +REMOTE databases were added in Hive 4.0.0 ([HIVE-24396](https://issues.apache.org/jira/browse/HIVE-24396)) for support for Data connectors. See documentation for [Data connectors](/docs/latest/user/data-connectors-in-hive).  ### Drop Database @@ -120,7 +120,7 @@ URL - URL of the remote datasource. In case of JDBC datasource, it would be the COMMENT - A short description for this connector. -DCPROPERTIES: Contains a set of name/value pairs that are set for the connector. The credentials for the remote datasource are specified as part of the DCPROPERTIES as documented in the [JDBC Storage Handler](https://hive.apache.org/docs/latest/user/jdbc-storage-handler) docs. 
All properties that start with a prefix of "hive.sql" are added to the tables mapped by this connector. +DCPROPERTIES: Contains a set of name/value pairs that are set for the connector. The credentials for the remote datasource are specified as part of the DCPROPERTIES as documented in the [JDBC Storage Handler](/docs/latest/user/jdbc-storage-handler) docs. All properties that start with a prefix of "hive.sql" are added to the tables mapped by this connector. ### Drop Connector @@ -284,7 +284,7 @@ CREATE TABLE creates a table with the given name. An error is thrown if a table * To specify a database for the table, either issue the [USE database_name]({{< ref "#use-database_name" >}}) statement prior to the CREATE TABLE statement (in [Hive 0.6](https://issues.apache.org/jira/browse/HIVE-675) and later) or qualify the table name with a database name ("`database_name.table.name`" in [Hive 0.7](https://issues.apache.org/jira/browse/HIVE-1517) and later). The keyword "`default`" can be used for the default database. -See [Alter Table](https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-table) below for more information about table comments, table properties, and SerDe properties. +See [Alter Table](/docs/latest/language/languagemanual-ddl#alter-table) below for more information about table comments, table properties, and SerDe properties. See [Type System]({{< ref "#type-system" >}}) and [Hive Data Types]({{< ref "languagemanual-types" >}}) for details about the primitive and complex data types. @@ -322,7 +322,7 @@ Use the SERDE clause to create a table with a custom SerDe. For more information You must specify a list of columns for tables that use a native SerDe. Refer to the [Types]({{< ref "languagemanual-types" >}}) part of the User Guide for the allowable column types. A list of columns for tables that use a custom SerDe may be specified but Hive will query the SerDe to determine the actual list of columns for this table. 
-For general information about SerDes, see [Hive SerDe](https://hive.apache.org/community/resources/developerguide#hive-serde) in the Developer Guide. Also see [SerDe](https://hive.apache.org/docs/latest/user/serde) for details about input and output processing. +For general information about SerDes, see [Hive SerDe](/community/resources/developerguide#hive-serde) in the Developer Guide. Also see [SerDe](/docs/latest/user/serde) for details about input and output processing. To change a table's SerDe or SERDEPROPERTIES, use the ALTER TABLE statement as described below in [Add SerDe Properties]({{< ref "#add-serde-properties" >}}). @@ -330,7 +330,7 @@ To change a table's SerDe or SERDEPROPERTIES, use the ALTER TABLE statement as d | --- | --- | | **RegEx**ROW FORMAT SERDE'org.apache.hadoop.hive.serde2.RegexSerDe'WITH SERDEPROPERTIES ("input.regex" = "")STORED AS TEXTFILE; | Stored as plain text file, translated by Regular Expression.The following example defines a table in the default Apache Weblog format.`CREATE` `TABLE` `apachelog (``host STRING,``identity STRING,``user` `STRING,``time` `STRING,``request STRING,``status STRING,``size` `STRING,``referer STRING,``agent STRING)``ROW FORMAT SERDE``'org.apache.hadoop.hive.serde2.RegexSerDe'``WITH` `SERDEPROPERTIES (``"input.regex"` `=``"([^]*) ([^]*) ([^]*) (-|\\[^\\]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\".*\") ([^ \"]*|\".*\"))?"``)``STORED``AS` `TEXTFILE;`More about RegexSerDe can be found here in [HIVE-662](https://issues.apache.org/jira/browse/HIVE-662) and [HIVE-1719](https://issues.apache.org/jira/browse/HIVE-1719). 
| | **JSON** ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' STORED AS TEXTFILE | Stored as plain text file in JSON format.The JsonSerDe for JSON files is available in [Hive 0.12](https://issues.apache.org/jira/browse/HIVE-4895) and later.In some distributions, a reference to hive-hcatalog-core.jar is required.`ADD JAR /usr/lib/hive-hcatalog/lib/hive-hcatalog-core.jar;CREATE` `TABLE` `my_table(a string, b``bigint``, ...)``ROW FORMAT SERDE``'org.apache.hive.hcatalog.data.JsonSerDe'``STORED``AS` `TEXTFILE;`The JsonSerDe was moved to Hive from HCatalog and before it was in hive-contrib project. It was added to the Hive distribution by [HIVE-4895](https://issues.apache.org/jira/browse/HIVE-4895).An Amazon SerDe is available at `s3://elasticmapreduce/samples/hive-ads/libs/jsonserde.jar` for releases prior to 0.12.0.The JsonSerDe for JSON files is available in [Hive 0.12](https://issues.apache.org/jira/browse/HIVE-4895) and later.Starting in Hive 3.0.0, JsonSerDe is added to Hive Serde as "org.apache.hadoop.hive.serde2.JsonSerDe" ([HIVE-19211](https://issues.apache.org/jira/browse/HIVE-19211)).`CREATE` `TABLE` `my_table(a string, b``bigint``, ...)``ROW FORMAT SERDE``'org.apache.hadoop.hive.serde2.JsonSerDe'``STORED``AS` `TEXTFILE;`Or `STORED AS JSONFILE` is supported starting in Hive 4.0.0 ([HIVE-19899](https://issues.apache.org/jira/browse/HIVE-19899)), so you can create table as follows:`CREATE` `TABLE` `my_table(a string, b``bigint``, ...) STORED AS JSONFILE;` | -| **CSV/TSV**ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS TEXTFILE | Stored as plain text file in CSV / TSV format. 
The CSVSerde is available in [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-7777) and greater.The following example creates a TSV (Tab-separated) file.``CREATE` `TABLE` `my_table(a string, b string, ...)`ROW FORMAT SERDE``'org.apache.hadoop.hive.serde2.OpenCSVSerde'``WITH` `SERDEPROPERTIES (``"separatorChar"` `=``"\t"``,``"quoteChar"`     `=``"'"``,``"escapeChar"`    `=``"\\"``)``STORED``AS` `TEXTFILE;`Default properties for SerDe is Comma-Separated (CSV) file `DEFAULT_ESCAPE_CHARACTER \``DEFAULT_QUOTE_CHARACTER  "``DEFAULT_SEPARATOR        ,`This SerDe works for most CSV data, but does not handle embedded newlines. To use the SerDe, specify the fully qualified class name org.apache.hadoop.hive.serde2.OpenCSVSerde.  Documentation is based on original documentation at .**Limitations**This SerDe treats all columns to be of type String. Even if you create a table with non-string column types using this SerDe, the DESCRIBE TABLE output would show string column type. The type information is retrieved from the SerDe. To convert columns to the desired type in a table, you can create a view over the table that does the CAST to the desired type.The CSV SerDe is based on , and was added to the Hive distribution in [HIVE-7777](https://issues.apache.org/jira/browse/HIVE-7777).The CSVSerde has been built and tested against Hive 0.14 and later, and uses [Open-CSV](http://opencsv.sourceforge.net/) 2.3 which is bundled with the Hive distribution.For general information about SerDes, see [Hive SerDe](https://hive.apache.org/community/resources/developerguide#hive-serde) in the Developer Guide. Also see [SerDe](https://hive.apache.org/docs/latest/user/serde) for details about input and output processing. | +| **CSV/TSV**ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS TEXTFILE | Stored as plain text file in CSV / TSV format. 
The CSVSerde is available in [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-7777) and greater.The following example creates a TSV (Tab-separated) file.``CREATE` `TABLE` `my_table(a string, b string, ...)`ROW FORMAT SERDE``'org.apache.hadoop.hive.serde2.OpenCSVSerde'``WITH` `SERDEPROPERTIES (``"separatorChar"` `=``"\t"``,``"quoteChar"`     `=``"'"``,``"escapeChar"`    `=``"\\"``)``STORED``AS` `TEXTFILE;`Default properties for SerDe is Comma-Separated (CSV) file `DEFAULT_ESCAPE_CHARACTER \``DEFAULT_QUOTE_CHARACTER  "``DEFAULT_SEPARATOR        ,`This SerDe works for most CSV data, but does not handle embedded newlines. To use the SerDe, specify the fully qualified class name org.apache.hadoop.hive.serde2.OpenCSVSerde.  Documentation is based on original documentation at .**Limitations**This SerDe treats all columns to be of type String. Even if you create a table with non-string column types using this SerDe, the DESCRIBE TABLE output would show string column type. The type information is retrieved from the SerDe. To convert columns to the desired type in a table, you can create a view over the table that does the CAST to the desired type.The CSV SerDe is based on , and was added to the Hive distribution in [HIVE-7777](https://issues.apache.org/jira/browse/HIVE-7777).The CSVSerde has been built and tested against Hive 0.14 and later, and uses [Open-CSV](http://opencsv.sourceforge.net/) 2.3 which is bundled with the Hive distribution.For general information about SerDes, see [Hive SerDe](/community/resources/developerguide#hive-serde) in the Developer Guide. Also see [SerDe](/docs/latest/user/serde) for details about input and output processing. | #### Partitioned Tables @@ -563,7 +563,7 @@ Version information As of Hive 4.0 ([HIVE-18453](https://issues.apache.org/jira/browse/HIVE-18453)). -A table that supports operations with ACID semantics. See [this](https://hive.apache.org/docs/latest/user/hive-transactions) for more details about transactional tables. 
+A table that supports operations with ACID semantics. See [this](/docs/latest/user/hive-transactions) for more details about transactional tables. **Example:** @@ -1098,7 +1098,7 @@ If any partition in a table has NO_DROP enabled, the table cannot be dropped eit Version information In Hive release [0.13.0](https://issues.apache.org/jira/browse/HIVE-5317) and later when [transactions]({{< ref "hive-transactions" >}}) are being used, the ALTER TABLE statement can request [compaction]({{< ref "#compaction" >}}) of a table or partition. -As of Hive release [1.3.0 and 2.1.0](https://issues.apache.org/jira/browse/HIVE-13354) when [transactions]({{< ref "hive-transactions" >}}) are being used, the ALTER TABLE ... COMPACT statement can include a [TBLPROPERTIES](https://hive.apache.org/docs/latest/user/hive-transactions#table-properties) clause that is either to change compaction MapReduce job properties or to overwrite any other Hive table properties. More details can be found [here](https://hive.apache.org/docs/latest/user/hive-transactions#table-properties). +As of Hive release [1.3.0 and 2.1.0](https://issues.apache.org/jira/browse/HIVE-13354) when [transactions]({{< ref "hive-transactions" >}}) are being used, the ALTER TABLE ... COMPACT statement can include a [TBLPROPERTIES](/docs/latest/user/hive-transactions#table-properties) clause that is either to change compaction MapReduce job properties or to overwrite any other Hive table properties. More details can be found [here](/docs/latest/user/hive-transactions#table-properties). As of Hive release [4.0.0-alpha-2](https://issues.apache.org/jira/browse/HIVE-27056?jql=project%20%3D%20HIVE%20AND%20fixVersion%20%3D%204.0.0-alpha-2) [compaction pooling]({{< ref "compaction-pooling" >}}) is available. As of Hive release [4.0.0](https://issues.apache.org/jira/browse/HIVE-27094?jql=project%20%3D%20HIVE%20AND%20fixVersion%20%3D%204.0.0) [rebalance compaction]({{< ref "rebalance-compaction" >}}) is available. 
@@ -1286,11 +1286,11 @@ Note that a view is a purely logical object with no associated storage. When a q A view's schema is frozen at the time the view is created; subsequent changes to underlying tables (e.g. adding a column) will not be reflected in the view's schema. If an underlying table is dropped or changed in an incompatible fashion, subsequent attempts to query the invalid view will fail. -Views are read-only and may not be used as the target of LOAD/INSERT/ALTER. For changing metadata, see [ALTER VIEW](https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-view-properties). +Views are read-only and may not be used as the target of LOAD/INSERT/ALTER. For changing metadata, see [ALTER VIEW](/docs/latest/language/languagemanual-ddl#alter-view-properties). A view may contain ORDER BY and LIMIT clauses. If a referencing query also contains these clauses, the query-level clauses are evaluated **after** the view clauses (and after any other operations in the query). For example, if a view specifies LIMIT 5, and a referencing query is executed as (select * from v LIMIT 10), then at most 5 rows will be returned. -Starting with [Hive 0.13.0](https://issues.apache.org/jira/browse/HIVE-1180), the view's select statement can include one or more common table expressions (CTEs) as shown in the [SELECT syntax](https://hive.apache.org/docs/latest/language/languagemanual-select#select-syntax). For examples of CTEs in CREATE VIEW statements, see [Common Table Expression](https://hive.apache.org/docs/latest/language/common-table-expression#cte-in-views-ctas-and-insert-statements). +Starting with [Hive 0.13.0](https://issues.apache.org/jira/browse/HIVE-1180), the view's select statement can include one or more common table expressions (CTEs) as shown in the [SELECT syntax](/docs/latest/language/languagemanual-select#select-syntax). 
For examples of CTEs in CREATE VIEW statements, see [Common Table Expression](/docs/latest/language/common-table-expression#cte-in-views-ctas-and-insert-statements). **Example:** @@ -2004,7 +2004,7 @@ As of [Hive 0.13.0](https://issues.apache.org/jira/browse/HIVE-6460) (see [Hive SHOW COMPACTIONS [DATABASE.][TABLE] [PARTITION ()] [POOL_NAME] [TYPE] [STATE] [ORDER BY `start` DESC] [LIMIT 10]; ``` -[SHOW COMPACTIONS](https://hive.apache.org/docs/latest/user/hive-transactions#show-compactions) returns a list of all compaction requests currently being [processed]({{< ref "#processed" >}}) or scheduled, including this information: +[SHOW COMPACTIONS](/docs/latest/user/hive-transactions#show-compactions) returns a list of all compaction requests currently being [processed]({{< ref "#processed" >}}) or scheduled, including this information: * "CompactionId" - unique internal id (As of [Hive 3.0](https://issues.apache.org/jira/browse/HIVE-16084)) * "Database" - Hive database name diff --git a/content/docs/latest/language/languagemanual-dml.md b/content/docs/latest/language/languagemanual-dml.md index a29ab933..6fc6fdca 100644 --- a/content/docs/latest/language/languagemanual-dml.md +++ b/content/docs/latest/language/languagemanual-dml.md @@ -118,7 +118,7 @@ INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) sel ##### Notes * Multi Table Inserts minimize the number of data scans required. Hive can insert data into multiple tables by scanning the input data just once (and applying different query operators) to the input data. -* Starting with [Hive 0.13.0](https://issues.apache.org/jira/browse/HIVE-1180), the select statement can include one or more common table expressions (CTEs) as shown in the [SELECT syntax](https://hive.apache.org/docs/latest/language/languagemanual-select#select-syntax). For an example, see [Common Table Expression](https://hive.apache.org/docs/latest/language/common-table-expression#cte-in-views-ctas-and-insert-statements). 
+* Starting with [Hive 0.13.0](https://issues.apache.org/jira/browse/HIVE-1180), the select statement can include one or more common table expressions (CTEs) as shown in the [SELECT syntax](/docs/latest/language/languagemanual-select#select-syntax). For an example, see [Common Table Expression](/docs/latest/language/common-table-expression#cte-in-views-ctas-and-insert-statements). ##### Dynamic Partition Inserts @@ -221,7 +221,7 @@ where a value is either null or any valid SQL literal * Each row listed in the VALUES clause is inserted into table *tablename*. * Values must be provided for every column in the table. The standard SQL syntax that allows the user to insert values into only some columns is not yet supported. To mimic the standard SQL, nulls can be provided for columns the user does not wish to assign a value to. -* Dynamic partitioning is supported in the same way as for [INSERT...SELECT](https://hive.apache.org/docs/latest/language/languagemanual-dml#dynamic-partition-inserts). +* Dynamic partitioning is supported in the same way as for [INSERT...SELECT](/docs/latest/language/languagemanual-dml#dynamic-partition-inserts). * If the table being inserted into supports [ACID]({{< ref "hive-transactions" >}}) and a transaction manager that supports ACID is in use, this operation will be auto-committed upon successful completion. * Hive does not support literals for complex types (array, map, struct, union), so it is not possible to use them in INSERT INTO...VALUES clauses. This means that the user cannot insert data into a complex datatype column using the INSERT INTO...VALUES clause. 
diff --git a/content/docs/latest/language/languagemanual-indexing.md b/content/docs/latest/language/languagemanual-indexing.md index e658ddc8..b9e9ff98 100644 --- a/content/docs/latest/language/languagemanual-indexing.md +++ b/content/docs/latest/language/languagemanual-indexing.md @@ -28,10 +28,10 @@ Hive indexing was added in version 0.7.0, and bitmap indexing was added in versi Documentation and examples of how to use Hive indexes can be found here: -* [Indexes](https://hive.apache.org/development/desingdocs/indexdev) – design document (lists indexing JIRAs with current status, starting with [HIVE-417](https://issues.apache.org/jira/browse/HIVE-417)) -* [Create/Drop/Alter Index]({{< ref "#create/drop/alter-index" >}}) – [HiveQL Language Manual DDL](https://hive.apache.org/docs/latest/language/languagemanual-ddl) -* [Show Indexes](https://hive.apache.org/docs/latest/language/languagemanual-ddl#show-indexes) – [HiveQL Language Manual DDL](https://hive.apache.org/docs/latest/language/languagemanual-ddl) -* [Bitmap indexes](https://hive.apache.org/development/desingdocs/indexdev-bitmap) – added in Hive version 0.8.0 ([HIVE-1803](https://issues.apache.org/jira/browse/HIVE-1803)) +* [Indexes](/development/desingdocs/indexdev) – design document (lists indexing JIRAs with current status, starting with [HIVE-417](https://issues.apache.org/jira/browse/HIVE-417)) +* [Create/Drop/Alter Index]({{< ref "#create/drop/alter-index" >}}) – [HiveQL Language Manual DDL](/docs/latest/language/languagemanual-ddl) +* [Show Indexes](/docs/latest/language/languagemanual-ddl#show-indexes) – [HiveQL Language Manual DDL](/docs/latest/language/languagemanual-ddl) +* [Bitmap indexes](/development/desingdocs/indexdev-bitmap) – added in Hive version 0.8.0 ([HIVE-1803](https://issues.apache.org/jira/browse/HIVE-1803)) * [Indexed Hive](http://www.slideshare.net/NikhilDeshpande/indexed-hive) – overview and examples by Prafulla Tekawade and Nikhil Deshpande, October 2010 * [Tutorial: SQL-like join 
and index with MapReduce using Hadoop and Hive](http://asheeshgarg.blogspot.com/2012/04/sql-like-join-and-index-with-mr-using.html) – blog by Ashish Garg, April 2012 diff --git a/content/docs/latest/language/languagemanual-joins.md b/content/docs/latest/language/languagemanual-joins.md index 58ddb30a..af65bf2a 100644 --- a/content/docs/latest/language/languagemanual-joins.md +++ b/content/docs/latest/language/languagemanual-joins.md @@ -272,7 +272,7 @@ See [Hive Outer Join Behavior]({{< ref "outerjoinbehavior" >}}) for information ### Enhancements in Hive Version 0.11 -See [Join Optimization](https://hive.apache.org/docs/latest/language/languagemanual-joinoptimization) for information about enhancements to join optimization introduced in Hive version 0.11.0. The use of hints is de-emphasized in the enhanced optimizations ([HIVE-3784](https://issues.apache.org/jira/browse/HIVE-3784) and related JIRAs). +See [Join Optimization](/docs/latest/language/languagemanual-joinoptimization) for information about enhancements to join optimization introduced in Hive version 0.11.0. The use of hints is de-emphasized in the enhanced optimizations ([HIVE-3784](https://issues.apache.org/jira/browse/HIVE-3784) and related JIRAs). diff --git a/content/docs/latest/language/languagemanual-sortby.md b/content/docs/latest/language/languagemanual-sortby.md index 70a38ffd..863b3514 100644 --- a/content/docs/latest/language/languagemanual-sortby.md +++ b/content/docs/latest/language/languagemanual-sortby.md @@ -25,8 +25,8 @@ There are some limitations in the "order by" clause. In the strict mode (i.e., [ Note that columns are specified by name, not by position number. 
However in [Hive 0.11.0](https://issues.apache.org/jira/browse/HIVE-581) and later, columns can be specified by position when configured as follows: -* For Hive 0.11.0 through 2.1.x, set [hive.groupby.orderby.position.alias](https://hive.apache.org/docs/latest/user/configuration-properties#hivegroupbyorderbypositionalias) to true (the default is false). -* For Hive 2.2.0 and later, [hive.orderby.position.alias](https://hive.apache.org/docs/latest/user/configuration-properties#hiveorderbypositionalias) is true by default. +* For Hive 0.11.0 through 2.1.x, set [hive.groupby.orderby.position.alias](/docs/latest/user/configuration-properties#hivegroupbyorderbypositionalias) to true (the default is false). +* For Hive 2.2.0 and later, [hive.orderby.position.alias](/docs/latest/user/configuration-properties#hiveorderbypositionalias) is true by default. The default sorting order is ascending (ASC). @@ -185,8 +185,8 @@ SELECT col1, col2 FROM t1 DISTRIBUTE BY col1 SORT BY col1 ASC, col2 DESC Note that columns are specified by name, not by position number. However in [HIVE-28572](https://issues.apache.org/jira/browse/HIVE-28572) and later, columns can be specified by position when configured as follows: -* set [hive.orderby.position.alias](https://hive.apache.org/docs/latest/configuration-properties/#hiveorderbypositionalias)=true; -* set [hive.cbo.enable](https://hive.apache.org/docs/latest/configuration-properties/#hivecboenable)=true; +* set [hive.orderby.position.alias](/docs/latest/user/configuration-properties/#hiveorderbypositionalias)=true; +* set [hive.cbo.enable](/docs/latest/user/configuration-properties/#hivecboenable)=true; When any of the above conditions are not met, no distribution is performed. 
diff --git a/content/docs/latest/language/languagemanual-types.md b/content/docs/latest/language/languagemanual-types.md index 8f6b6c5f..d4c053d1 100644 --- a/content/docs/latest/language/languagemanual-types.md +++ b/content/docs/latest/language/languagemanual-types.md @@ -17,33 +17,29 @@ For data types supported by HCatalog, see: ### Numeric Types -* [`TINYINT`](https://hive.apache.org/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint) (1-byte signed integer, from `-128` to `127`) -* [`SMALLINT`](https://hive.apache.org/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint) (2-byte signed integer, from `-32,768` to `32,767`) -* ``` -[INT](https://hive.apache.org/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint)/INTEGER (4-byte signed integer, from -2,147,483,648 to 2,147,483,647) -``` -* [`BIGINT`](https://hive.apache.org/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint) (8-byte signed integer, from `-9,223,372,036,854,775,808` to `9,223,372,036,854,775,807`) +* [`TINYINT`](/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint) (1-byte signed integer, from `-128` to `127`) +* [`SMALLINT`](/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint) (2-byte signed integer, from `-32,768` to `32,767`) +* [INT](/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint)/INTEGER (4-byte signed integer, from -2,147,483,648 to 2,147,483,647) +* [`BIGINT`](/docs/latest/language/languagemanual-types#integral-types-tinyintsmallintintintegerbigint) (8-byte signed integer, from `-9,223,372,036,854,775,808` to `9,223,372,036,854,775,807`) * `FLOAT` (4-byte single precision floating point number) * `DOUBLE` (8-byte double precision floating point number) -* ``` -DOUBLE PRECISION (alias for DOUBLE, only available starting with 
Hive [2.2.0](https://issues.apache.org/jira/browse/HIVE-13556)) -``` -* [`DECIMAL`](https://hive.apache.org/docs/latest/language/languagemanual-types#decimals) +* DOUBLE PRECISION (alias for DOUBLE, only available starting with Hive [2.2.0](https://issues.apache.org/jira/browse/HIVE-13556)) +* [`DECIMAL`](/docs/latest/language/languagemanual-types#decimals) + Introduced in Hive [0.11.0](https://issues.apache.org/jira/browse/HIVE-2693) with a precision of 38 digits + Hive [0.13.0](https://issues.apache.org/jira/browse/HIVE-3976) introduced user-definable precision and scale * `NUMERIC` (same as `DECIMAL`, starting with [Hive 3.0.0](https://issues.apache.org/jira/browse/HIVE-16764)) ### Date/Time Types -* [`TIMESTAMP`](https://hive.apache.org/docs/latest/language/languagemanual-types#timestamps) (Note: Only available starting with Hive [0.8.0](https://issues.apache.org/jira/browse/HIVE-2272)) -* [`DATE`](https://hive.apache.org/docs/latest/language/languagemanual-types#dates) (Note: Only available starting with Hive [0.12.0](https://issues.apache.org/jira/browse/HIVE-4055)) -* [`INTERVAL`](https://hive.apache.org/docs/latest/language/languagemanual-types#intervals) (Note: Only available starting with Hive [1.2.0](https://issues.apache.org/jira/browse/HIVE-9792)) +* [`TIMESTAMP`](/docs/latest/language/languagemanual-types#timestamps) (Note: Only available starting with Hive [0.8.0](https://issues.apache.org/jira/browse/HIVE-2272)) +* [`DATE`](/docs/latest/language/languagemanual-types#dates) (Note: Only available starting with Hive [0.12.0](https://issues.apache.org/jira/browse/HIVE-4055)) +* [`INTERVAL`](/docs/latest/language/languagemanual-types#intervals) (Note: Only available starting with Hive [1.2.0](https://issues.apache.org/jira/browse/HIVE-9792)) ### String Types -* [`STRING`](https://hive.apache.org/docs/latest/language/languagemanual-types#strings) -* [`VARCHAR`](https://hive.apache.org/docs/latest/language/languagemanual-types#varchar) (Note: Only 
available starting with Hive [0.12.0](https://issues.apache.org/jira/browse/HIVE-4844)) -* [`CHAR`](https://hive.apache.org/docs/latest/language/languagemanual-types#char) (Note: Only available starting with Hive [0.13.0](https://issues.apache.org/jira/browse/HIVE-5191)) +* [`STRING`](/docs/latest/language/languagemanual-types#strings) +* [`VARCHAR`](/docs/latest/language/languagemanual-types#varchar) (Note: Only available starting with Hive [0.12.0](https://issues.apache.org/jira/browse/HIVE-4844)) +* [`CHAR`](/docs/latest/language/languagemanual-types#char) (Note: Only available starting with Hive [0.13.0](https://issues.apache.org/jira/browse/HIVE-5191)) ### Misc Types @@ -185,7 +181,7 @@ CREATE TABLE foo ( ``` -For usage, see [LanguageManual Types#Floating Point Types](https://hive.apache.org/docs/latest/language/languagemanual-types#floating-point-types) in the Literals section below. +For usage, see [LanguageManual Types#Floating Point Types](/docs/latest/language/languagemanual-types#floating-point-types) in the Literals section below. #### Decimal Literals @@ -241,7 +237,7 @@ ALTER TABLE foo PARTITION (ds='2008-04-08', hr=12) CHANGE COLUMN dec_column_name ... ``` -The Decimal datatype is discussed further in [Floating Point Types](https://hive.apache.org/docs/latest/language/languagemanual-types#floating-point-types) below. +The Decimal datatype is discussed further in [Floating Point Types](/docs/latest/language/languagemanual-types#floating-point-types) below. ### Union Types @@ -288,7 +284,7 @@ Floating point literals are assumed to be DOUBLE. Scientific notation is not yet Version -Decimal datatype was introduced in Hive 0.11.0 ([HIVE-2693](https://issues.apache.org/jira/browse/HIVE-2693)). See [Decimal Datatype](https://hive.apache.org/docs/latest/language/languagemanual-types#decimals) above. +Decimal datatype was introduced in Hive 0.11.0 ([HIVE-2693](https://issues.apache.org/jira/browse/HIVE-2693)). 
See [Decimal Datatype](/docs/latest/language/languagemanual-types#decimals) above. `NUMERIC` is the same as `DECIMAL` as of Hive 3.0.0 ([HIVE-16764](https://issues.apache.org/jira/browse/HIVE-16764)). @@ -380,7 +376,7 @@ Two new tests have been added as part of the TestCliDriver framework within Hive More tests need to be added that demonstrate failure or when certain types of casts are prevented (for example, casting to date). There is some ambiguity in the round function because the rounding of Decimal does not work exactly as the SQL standard, and therefore it has been omitted in the current work. -For general information about running Hive tests, see [How to Contribute to Apache Hive](https://hive.apache.org/community/resources/howtocontribute) and [Hive Developer FAQ](https://hive.apache.org/community/resources/hivedeveloperfaq). +For general information about running Hive tests, see [How to Contribute to Apache Hive](/community/resources/howtocontribute) and [Hive Developer FAQ](/community/resources/hivedeveloperfaq). ## Handling of NULL Values diff --git a/content/docs/latest/language/languagemanual-udf.md b/content/docs/latest/language/languagemanual-udf.md index cdce416c..92341edf 100644 --- a/content/docs/latest/language/languagemanual-udf.md +++ b/content/docs/latest/language/languagemanual-udf.md @@ -180,7 +180,7 @@ Version The decimal datatype was introduced in Hive 0.11.0 ([HIVE-2693](https://issues.apache.org/jira/browse/HIVE-2693)). -All regular arithmetic operators (such as +, -, *, /) and relevant mathematical UDFs (Floor, Ceil, Round, and many more) have been updated to handle decimal types. For a list of supported UDFs, see [Mathematical UDFs](https://hive.apache.org/docs/latest/language/languagemanual-types#mathematical-udfs) in [Hive Data Types](https://hive.apache.org/docs/latest/language/languagemanual-types). 
+All regular arithmetic operators (such as +, -, *, /) and relevant mathematical UDFs (Floor, Ceil, Round, and many more) have been updated to handle decimal types. For a list of supported UDFs, see [Mathematical UDFs](/docs/latest/language/languagemanual-types#mathematical-udfs) in [Hive Data Types](/docs/latest/language/languagemanual-types). ### Collection Functions diff --git a/content/docs/latest/language/languagemanual.md b/content/docs/latest/language/languagemanual.md index e835762a..cf35e5b3 100644 --- a/content/docs/latest/language/languagemanual.md +++ b/content/docs/latest/language/languagemanual.md @@ -48,7 +48,7 @@ This is the Hive Language Manual.  For other Hive documentation, see the Hive w + [Virtual Columns]({{< ref "languagemanual-virtualcolumns" >}}) + [Windowing and Analytics Functions]({{< ref "languagemanual-windowingandanalytics" >}}) + [Enhanced Aggregation, Cube, Grouping and Rollup]({{< ref "enhanced-aggregation-cube-grouping-and-rollup" >}}) - + Procedural Language:  [Hive HPL/SQL](https://hive.apache.org/docs/latest/user/hive-hpl-sql) + + Procedural Language:  [Hive HPL/SQL](/docs/latest/user/hive-hpl-sql) + [Explain Execution Plan]({{< ref "languagemanual-explain" >}}) * [Locks]({{< ref "locking" >}}) * [Authorization]({{< ref "languagemanual-authorization" >}}) diff --git a/content/docs/latest/language/materialized-views.md b/content/docs/latest/language/materialized-views.md index 3fac44a7..efce227c 100644 --- a/content/docs/latest/language/materialized-views.md +++ b/content/docs/latest/language/materialized-views.md @@ -25,7 +25,7 @@ In this section, we present the main operations that are currently present in Hi ### Materialized views creation -The syntax to create a materialized view in Hive is very similar to the [CTAS statement](https://hive.apache.org/docs/latest/language/languagemanual-ddl#create-table-as-select-ctas) syntax, supporting common features such as partition columns, custom storage handler, or passing table 
properties. +The syntax to create a materialized view in Hive is very similar to the [CTAS statement](/docs/latest/language/languagemanual-ddl#create-table-as-select-ctas) syntax, supporting common features such as partition columns, custom storage handler, or passing table properties. ``` CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db_name.]materialized_view_name @@ -50,7 +50,7 @@ By default, materialized views are usable for query rewriting by the optimizer, The default values for SerDe and storage format when they are not specified in the materialized view creation statement (they are optional) are specified using the configuration properties `hive.materializedview.serde` and `hive.materializedview.fileformat`, respectively. -Materialized views can be stored in external systems, e.g., [Druid](https://hive.apache.org/docs/latest/user/druid-integration), using custom storage handlers. For instance, the following statement creates a materialized view that is stored in Druid: +Materialized views can be stored in external systems, e.g., [Druid](/docs/latest/user/druid-integration), using custom storage handlers. 
For instance, the following statement creates a materialized view that is stored in Druid: **Example:** diff --git a/content/docs/latest/language/scheduled-queries.md b/content/docs/latest/language/scheduled-queries.md index 9ff2ddcf..7c16389e 100644 --- a/content/docs/latest/language/scheduled-queries.md +++ b/content/docs/latest/language/scheduled-queries.md @@ -32,16 +32,21 @@ Hive has it’s scheduled query interface built into the language itself for eas ## Create Scheduled query syntax -**CREATE SCHEDULED QUERY ** -**[](https://hive.apache.org/docs/latest/language/scheduled-queries#schedulespecification-syntax)** -**[[](https://hive.apache.org/docs/latest/language/scheduled-queries#executedas-syntax)** **]** -**[[](https://hive.apache.org/docs/latest/language/scheduled-queries#enablespecification-syntax)]** -**[](https://hive.apache.org/docs/latest/language/scheduled-queries#defined-as-syntax)** +**CREATE SCHEDULED QUERY +[``](/docs/latest/language/scheduled-queries#schedulespecification-syntax) +[[``](/docs/latest/language/scheduled-queries#executedas-syntax)] +[[``](/docs/latest/language/scheduled-queries#enablespecification-syntax)] +[``](/docs/latest/language/scheduled-queries#defined-as-syntax)** ## Alter Scheduled query syntax -**ALTER SCHEDULED QUERY ** **([](https://hive.apache.org/docs/latest/language/scheduled-queries#schedulespecification-syntax)|[](https://hive.apache.org/docs/latest/language/scheduled-queries#executedas-syntax)|[](https://hive.apache.org/docs/latest/language/scheduled-queries#enablespecification-syntax)|[](https://hive.apache.org/docs/latest/language/scheduled-queries#defined-as-syntax)|[](https://hive.apache.org/docs/latest/language/scheduled-queries#executespec-syntax));** +**ALTER SCHEDULED QUERY ( +[``](/docs/latest/language/scheduled-queries#schedulespecification-syntax)| +[``](/docs/latest/language/scheduled-queries#executedas-syntax)| +[``](/docs/latest/language/scheduled-queries#enablespecification-syntax)| 
+[``](/docs/latest/language/scheduled-queries#defined-as-syntax)| +[``](/docs/latest/language/scheduled-queries#executespec-syntax));** ## Drop syntax diff --git a/content/docs/latest/language/sql-standard-based-hive-authorization.md b/content/docs/latest/language/sql-standard-based-hive-authorization.md index 48cf3751..9792209c 100644 --- a/content/docs/latest/language/sql-standard-based-hive-authorization.md +++ b/content/docs/latest/language/sql-standard-based-hive-authorization.md @@ -17,7 +17,7 @@ The default authorization model in Hive can be used to provide fine grained acce The SQL standards based authorization option (introduced in Hive 0.13) provides a third option for authorization in Hive. This is recommended because it allows Hive to be fully SQL compliant in its authorization model without causing backward compatibility issues for current users. As users migrate to this more secure model, the current default authorization could be deprecated. -For an overview of this authorization option, see [SQL Standards Based Authorization in HiveServer2](https://hive.apache.org/docs/latest/language/languagemanual-authorization#2-sql-standards-based-authorization-in-hiveserver2). +For an overview of this authorization option, see [SQL Standards Based Authorization in HiveServer2](/docs/latest/language/languagemanual-authorization#2-sql-standards-based-authorization-in-hiveserver2). This authorization mode can be used in conjunction with storage based authorization on the metastore server. Like the current default authorization in Hive, this will also be enforced at query compilation time. To provide security through this option, the client will have to be secured. This can be done by allowing users access only through Hive Server2, and by restricting the user code and non-SQL commands that can be run. The checks will happen against the user who submits the request, but the query will run as the Hive server user. 
The directories and files for input data would have read access for this Hive server user. For users who don’t have the need to protect against malicious users, this could potentially be supported through the Hive command line as well. @@ -35,7 +35,7 @@ The set commands used to change Hive configuration are restricted to a smaller s Privileges to add or drop functions and macros are restricted to the **admin** role. -To enable users to use functions, the ability to create [permanent functions](https://hive.apache.org/docs/latest/language/languagemanual-ddl#create-function) has been added. A user in the **admin** role can run commands to create these functions, which all users can then use. +To enable users to use functions, the ability to create [permanent functions](/docs/latest/language/languagemanual-ddl#create-function) has been added. A user in the **admin** role can run commands to create these functions, which all users can then use. The Hive [transform clause]({{< ref "languagemanual-transform" >}}) is also disabled when this authorization is enabled. @@ -86,11 +86,11 @@ User names are *case sensitive*. This is because, unlike role names, user names #### Quoted Identifiers -User and role names may optionally be surrounded by backtick characters (`) when the configuration parameter `[hive.support.quoted.identifiers](https://hive.apache.org/docs/latest/user/configuration-properties#hivesupportquotedidentifiers)` is set to `column` (default value). All [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) characters are permitted in the quoted identifiers, with double backticks (``) representing a backtick character. However when `[hive.support.quoted.identifiers](https://hive.apache.org/docs/latest/user/configuration-properties#hivesupportquotedidentifiers)` is set to `none`, only alphanumeric and underscore characters are permitted in user names and role names. 
+User and role names may optionally be surrounded by backtick characters (\`) when the configuration parameter [`hive.support.quoted.identifiers`](/docs/latest/user/configuration-properties#hivesupportquotedidentifiers) is set to `column` (default value). All [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) characters are permitted in the quoted identifiers, with double backticks (\`\`) representing a backtick character. However when [`hive.support.quoted.identifiers`](/docs/latest/user/configuration-properties#hivesupportquotedidentifiers) is set to `none`, only alphanumeric and underscore characters are permitted in user names and role names. For details, see [HIVE-6013](https://issues.apache.org/jira/browse/HIVE-6013) and [Supporting Quoted Identifiers in Column Names](https://issues.apache.org/jira/secure/attachment/12618321/QuotedIdentifier.html). -As of [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-8083), user may be optionally surrounded by backtick characters (`) irrespective of the `[hive.support.quoted.identifiers](https://hive.apache.org/docs/latest/user/configuration-properties#hivesupportquotedidentifiers)` setting. +As of [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-8083), user may be optionally surrounded by backtick characters (\`) irrespective of the [`hive.support.quoted.identifiers`](/docs/latest/user/configuration-properties#hivesupportquotedidentifiers) setting. 
### **Role Management Commands** diff --git a/content/docs/latest/overview-of-major-changes.md b/content/docs/latest/overview-of-major-changes.md index 46ab13a4..546eba9f 100644 --- a/content/docs/latest/overview-of-major-changes.md +++ b/content/docs/latest/overview-of-major-changes.md @@ -31,8 +31,8 @@ date: 2024-12-12 * ### **Compaction** - + [Rebalance compaction](https://hive.apache.org/docs/latest/user/rebalance-compaction) (Hive ACID) - + Compaction requests prioritization ([compaction pooling](https://hive.apache.org/docs/latest/language/compaction-pooling)) + + [Rebalance compaction](/docs/latest/user/rebalance-compaction) (Hive ACID) + + Compaction requests prioritization ([compaction pooling](/docs/latest/language/compaction-pooling)) + Iceberg compaction (Major) @@ -51,7 +51,7 @@ date: 2024-12-12 * ### HiveServer2 - + Support [SAML 2.0](https://hive.apache.org/development/desingdocs/support-saml-2-0-authentication-mode)/JWT authentication mode + + Support [SAML 2.0](/development/desingdocs/support-saml-2-0-authentication-mode)/JWT authentication mode + Support both Kerberos and LDAP auth methods in parallel + Graceful shutdown + Easy access to the operation log through web UI diff --git a/content/docs/latest/user/Hive-Transactions-ACID.md b/content/docs/latest/user/Hive-Transactions-ACID.md index 4a0c0a1d..438b1b02 100644 --- a/content/docs/latest/user/Hive-Transactions-ACID.md +++ b/content/docs/latest/user/Hive-Transactions-ACID.md @@ -14,7 +14,7 @@ Transactions with ACID semantics have been added to Hive to address the followin 1. Streaming ingest of data.  Many users have tools such as [Apache Flume](http://flume.apache.org/), [Apache Storm](https://storm.incubator.apache.org/), or [Apache Kafka](http://kafka.apache.org/) that they use to stream data into their Hadoop cluster.  While these tools can write data at rates of hundreds or more rows per second, Hive can only add partitions every fifteen minutes to an hour.  
Adding partitions more often leads quickly to an overwhelming number of partitions in the table.  These tools could stream data into existing partitions, but this would cause readers to get dirty reads (that is, they would see data written after they had started their queries) and leave many small files in their directories that would put pressure on the NameNode.  With this new functionality this use case will be supported while allowing readers to get a consistent view of the data and avoiding too many files. 2. Slow changing dimensions.  In a typical star schema data warehouse, dimensions tables change slowly over time.  For example, a retailer will open new stores, which need to be added to the stores table, or an existing store may change its square footage or some other tracked characteristic.  These changes lead to inserts of individual records or updates of records (depending on the strategy chosen). 3. Data restatement.  Sometimes collected data is found to be incorrect and needs correction.  Or the first instance of the data may be an approximation (90% of servers reporting) with the full data provided later.  Or business rules may require that certain transactions be restated due to subsequent transactions (e.g., after making a purchase a customer may purchase a membership and thus be entitled to discount prices, including on the previous purchase).  Or a user may be contractually required to remove their customer’s data upon termination of their relationship. -4. Bulk updates using [SQL MERGE](https://hive.apache.org/docs/latest/language/languagemanual-dml#merge) statement. +4. Bulk updates using [SQL MERGE](/docs/latest/language/languagemanual-dml#merge) statement. ## Limitations @@ -26,14 +26,14 @@ Transactions with ACID semantics have been added to Hive to address the followin * At this time only snapshot level isolation is supported.  When a given query starts it will be provided with a consistent snapshot of the data.  
There is no support for dirty read, read committed, repeatable read, or serializable.  With the introduction of BEGIN the intention is to support snapshot isolation for the duration of transaction rather than just a single query.  Other isolation levels may be added depending on user requests. * The existing ZooKeeper and in-memory lock managers are not compatible with transactions.  There is no intention to address this issue.  See [Basic Design]({{< ref "#basic-design" >}}) below for a discussion of how locks are stored for transactions. * Using Oracle as the Metastore DB and "datanucleus.connectionPoolingType=BONECP" may generate intermittent "No such lock.." and "No such transaction..." errors.  Setting "datanucleus.connectionPoolingType=DBCP" is recommended in this case. -* [LOAD DATA...](https://hive.apache.org/docs/latest/language/languagemanual-dml#loading-files-into-tables) statement is not supported with transactional tables.  (This was not properly enforced until [HIVE-16732](https://issues.apache.org/jira/browse/HIVE-16732)) +* [LOAD DATA...](/docs/latest/language/languagemanual-dml#loading-files-into-tables) statement is not supported with transactional tables.  (This was not properly enforced until [HIVE-16732](https://issues.apache.org/jira/browse/HIVE-16732)) ## Streaming APIs Hive offers APIs for streaming data ingest and streaming mutation: * [Hive HCatalog Streaming API]({{< ref "streaming-data-ingest" >}}) -* [Hive Streaming API](https://hive.apache.org/docs/latest/user/streaming-data-ingest-v2) (Since Hive 3) +* [Hive Streaming API](/docs/latest/user/streaming-data-ingest-v2) (Since Hive 3) * [HCatalog Streaming Mutation API (Copy)]({{< ref "HCatalog-Streaming-Mutation-API" >}}) (available in Hive 2.0.0 and later) A comparison of these two APIs is available in the [Background]({{< ref "#background" >}}) section of the Streaming Mutation document. 
@@ -52,7 +52,7 @@ The *SHOW LOCKS* command has been altered to provide information about the new l A new option has been added to *ALTER TABLE* to request a compaction of a table or partition.  In general users do not need to request compactions, as the system will detect the need for them and initiate the compaction.  However, if [compaction is turned off]({{< ref "#compaction-is-turned-off" >}}) for a table or a user wants to compact the table at a time the system would not choose to, *ALTER TABLE* can be used to initiate the compaction.  See [Alter Table/Partition Compact]({{< ref "#alter-table/partition-compact" >}}) for details.  This will enqueue a request for compaction and return.  To watch the progress of the compaction the user can use *SHOW COMPACTIONS*. -A new command *ABORT TRANSACTIONS* has been added, see [Abort Transactions](https://hive.apache.org/docs/latest/language/languagemanual-ddl#abort-transactions) for details. +A new command *ABORT TRANSACTIONS* has been added, see [Abort Transactions](/docs/latest/language/languagemanual-ddl#abort-transactions) for details. ## Basic Design @@ -118,7 +118,7 @@ A new logical entity called "transaction manager"  was added which incorporated The length of time that the DbLockManger will continue to try to acquire locks can be controlled via [hive.lock.numretires](http://Configuration Properties#hive.lock.numretires) and [hive.lock.sleep.between.retries](http://Configuration Properties#hive.lock.sleep.between.retries).  When the DbLockManager cannot acquire a lock (due to existence of a competing lock), it will back off and try again after a certain time period.  In order to support short running queries and not overwhelm the metastore at the same time, the DbLockManager will double the wait time after each retry.  The initial back off time is 100ms and is capped by hive.lock.sleep.between.retries.  hive.lock.numretries is the total number of times it will retry a given lock request.  
Thus the total time that the call to acquire locks will block (given values of 100 retries and 60s sleep time) is (100ms + 200ms + 400ms + ... + 51200ms + 60s + 60s + ... + 60s) = 91m:42s:300ms. -More [details](https://hive.apache.org/docs/latest/language/languagemanual-ddl#show-locks) on locks used by this Lock Manager. +More [details](/docs/latest/language/languagemanual-ddl#show-locks) on locks used by this Lock Manager. Note that the lock manager used by DbTxnManager will acquire locks on all tables, even those without "transactional=true" property.  By default, Insert operation into a non-transactional table will acquire an exclusive lock and thus block other inserts and reads.  While technically correct, this is a departure from how Hive traditionally worked (i.e. w/o a lock manger).  For backwards compatibility, [hive.txn.strict.locking.mode](http://Configuration Properties#hive.txn.strict.locking.mode) (see table below) is provided which will make this lock manager acquire shared locks on insert operations on non-transactional tables.  This restores previous semantics while still providing the benefit of a lock manager such as preventing table drop while it is being read.  Note that for transactional tables, insert always acquires share locks since these tables implement MVCC architecture at the storage layer and are able to provide strong read consistency (Snapshot Isolation) even in presence of concurrent modification operations. @@ -149,7 +149,7 @@ A number of new configuration parameters have been added to the system to suppor | [hive.txn.manager]({{< ref "#hive-txn-manager" >}})  | *Default:* org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager*Value required for transactions:* org.apache.hadoop.hive.ql.lockmgr.DbTxnManager | Client/HiveServer2 | DummyTxnManager replicates pre Hive-0.13 behavior and provides no transactions. 
| | [hive.txn.strict.locking.mode]({{< ref "#hive-txn-strict-locking-mode" >}}) | *Default:* true | Client/ HiveServer2 | In strict mode non-ACID resources use standard R/W lock semantics, e.g. INSERT will acquire exclusive lock. In non-strict mode, for non-ACID resources, INSERT will only acquire shared lock, which allows two concurrent writes to the same partition but still lets lock manager prevent DROP TABLE etc. when the table is being written to (as of [Hive 2.2.0](https://issues.apache.org/jira/browse/HIVE-15774)). | | [hive.txn.timeout]({{< ref "#hive-txn-timeout" >}}) deprecated. Use metastore.txn.timeout instead | *Default:* 300 | Client/HiveServer2/Metastore  | Time after which transactions are declared aborted if the client has not sent a heartbeat, in seconds. It's critical that this property has the same value for all components/services.5 | -| [hive.txn.heartbeat.threadpool.size](https://hive.apache.org/docs/latest/user/configuration-properties#hivetxnheartbeatthreadpoolsize) deprecated - but still in use | *Default:* 5 | Client/HiveServer2 | The number of threads to use for heartbeating (as of [Hive 1.3.0 and 2.0.0](https://issues.apache.org/jira/browse/HIVE-12366)). | +| [hive.txn.heartbeat.threadpool.size](/docs/latest/user/configuration-properties#hivetxnheartbeatthreadpoolsize) deprecated - but still in use | *Default:* 5 | Client/HiveServer2 | The number of threads to use for heartbeating (as of [Hive 1.3.0 and 2.0.0](https://issues.apache.org/jira/browse/HIVE-12366)). | | [hive.timedout.txn.reaper.start]({{< ref "#hive-timedout-txn-reaper-start" >}}) deprecated | *Default:* 100s | Metastore | Time delay of first reaper (the process which aborts timed-out transactions) run after the metastore starts (as of [Hive 1.3.0](https://issues.apache.org/jira/browse/HIVE-11317)). Controls AcidHouseKeeperServcie above. 
| | [hive.timedout.txn.reaper.interval]({{< ref "#hive-timedout-txn-reaper-interval" >}}) deprecated | *Default:* 180s | Metastore | Time interval describing how often the reaper (the process which aborts timed-out transactions) runs (as of [Hive 1.3.0](https://issues.apache.org/jira/browse/HIVE-11317)). Controls AcidHouseKeeperServcie above. | | [hive.txn.max.open.batch]({{< ref "#hive-txn-max-open-batch" >}}) deprecated. Use metastore.txn.max.open.batch instead | *Default:* 1000 | Client | Maximum number of transactions that can be fetched in one call to open_txns().1 | @@ -189,7 +189,7 @@ A number of new configuration parameters have been added to the system to suppor | hive.compactor.history.retention.succeeded deprecated. Use metastore.compactor.history.retention.succeeded instead | Default: 3 | Metastore | Number of successful compaction entries to retain in history (per partition). | | hive.compactor.history.retention.failed deprecated. Use metastore.compactor.history.retention.failed instead. | Default: 3 | Metastore | Number of failed compaction entries to retain in history (per partition). | | hive.compactor.history.retention.attempted deprecated. Use metastore.compactor.history.retention.did.not.initiate instead. | Default: 2 | Metastore | Number of attempted compaction entries to retain in history (per partition). | -| hive.compactor.initiator.failed.compacts.threshold deprecated. Use metastore.compactor.initiator.failed.compacts.threshold instead. | Default: 2 | Metastore | Number of of consecutive failed compactions for a given partition after which the Initiator will stop attempting to schedule compactions automatically. It is still possible to use [ALTER TABLE](https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact) to initiate compaction. Once a manually initiated compaction succeeds auto initiated compactions will resume. Note that this must be less than hive.compactor.history.retention.failed. 
| +| hive.compactor.initiator.failed.compacts.threshold deprecated. Use metastore.compactor.initiator.failed.compacts.threshold instead. | Default: 2 | Metastore | Number of consecutive failed compactions for a given partition after which the Initiator will stop attempting to schedule compactions automatically. It is still possible to use [ALTER TABLE](/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact) to initiate compaction. Once a manually initiated compaction succeeds auto initiated compactions will resume. Note that this must be less than hive.compactor.history.retention.failed. | | metastore.compactor.initiator.failed.compacts.threshold | *Default*: 2 (Allowed between 1 and 20) | Metastore | Number of consecutive compaction failures (per table/partition) after which automatic compactions will not be scheduled any more.  Note that this must be less than hive.compactor.history.retention.failed. | | hive.compactor.history.reaper.interval deprecated. metastore.acid.housekeeper.interval handles it. | Default: 2m | Metastore | Controls how often the process to purge historical record of compactions runs. | | ACID metrics | | | | @@ -226,7 +226,7 @@ If the data in your system is not owned by the Hive user (i.e., the user that th ### Compaction pooling -More in formation on compaction pooling can be found here: [Compaction pooling](https://hive.apache.org/docs/latest/language/compaction-pooling) +More information on compaction pooling can be found here: [Compaction pooling](/docs/latest/language/compaction-pooling) ## Table Properties @@ -236,7 +236,7 @@ If a table owner does not wish the system to automatically determine when to com Table properties are set with the TBLPROPERTIES clause when a table is created or altered, as described in the [Create Table]({{< ref "#create-table" >}}) and [Alter Table Properties]({{< ref "#alter-table-properties" >}}) sections of Hive Data Definition Language.
The "`transactional`" and "`NO_AUTO_COMPACTION`" table properties are case-insensitive. -More compaction related options can be set via TBLPROPERTIES. They can be set at both table-level via [CREATE TABLE](https://hive.apache.org/docs/latest/language/languagemanual-ddl#createdroptruncate-table), and on request-level via [ALTER TABLE/PARTITION COMPACT](https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact).  These are used to override the Warehouse/table wide settings.  For example, to override an MR property to affect a compaction job, one can add "compactor.=" in either CREATE TABLE statement or when launching a compaction explicitly via ALTER TABLE.  The "=" will be set on JobConf of the compaction MR job. Similarly, "tblprops.=" can be used to set/override any table property which is interpreted by the code running on the cluster.  Finally, "compactorthreshold.=" can be used to override properties from the "New Configuration Parameters for Transactions" table above that end with ".threshold" and control when compactions are triggered by the system.  Examples: +More compaction related options can be set via TBLPROPERTIES. They can be set at both table-level via [CREATE TABLE](/docs/latest/language/languagemanual-ddl#createdroptruncate-table), and on request-level via [ALTER TABLE/PARTITION COMPACT](/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact).  These are used to override the Warehouse/table wide settings.  For example, to override an MR property to affect a compaction job, one can add "compactor.=" in either CREATE TABLE statement or when launching a compaction explicitly via ALTER TABLE.  The "=" will be set on JobConf of the compaction MR job. Similarly, "tblprops.=" can be used to set/override any table property which is interpreted by the code running on the cluster.  
Finally, "compactorthreshold.=" can be used to override properties from the "New Configuration Parameters for Transactions" table above that end with ".threshold" and control when compactions are triggered by the system.  Examples: **Example: Set compaction options in TBLPROPERTIES at table level** diff --git a/content/docs/latest/user/accumulointegration.md b/content/docs/latest/user/accumulointegration.md index ffba8561..2d583aa3 100644 --- a/content/docs/latest/user/accumulointegration.md +++ b/content/docs/latest/user/accumulointegration.md @@ -11,7 +11,7 @@ date: 2024-12-12 ## Implementation -The initial implementation was added to Hive 0.14 in [HIVE-7068](https://issues.apache.org/jira/browse/HIVE-7068) and is designed to work with Accumulo 1.6.x. There are two main components which make up the implementation: the AccumuloStorageHandler and the AccumuloPredicateHandler. The AccumuloStorageHandler is a [StorageHandler](https://hive.apache.org/development/desingdocs/storagehandlers) implementation. The primary roles of this class are to manage the mapping of Hive table to Accumulo table and configures Hive queries. The AccumuloPredicateHandler is used push down filter operations to the Accumulo for more efficient reduction of data. +The initial implementation was added to Hive 0.14 in [HIVE-7068](https://issues.apache.org/jira/browse/HIVE-7068) and is designed to work with Accumulo 1.6.x. There are two main components which make up the implementation: the AccumuloStorageHandler and the AccumuloPredicateHandler. The AccumuloStorageHandler is a [StorageHandler](/development/desingdocs/storagehandlers) implementation. The primary roles of this class are to manage the mapping of Hive table to Accumulo table and configure Hive queries. The AccumuloPredicateHandler is used to push down filter operations to Accumulo for more efficient reduction of data. 
## Accumulo Configuration diff --git a/content/docs/latest/user/avroserde.md b/content/docs/latest/user/avroserde.md index 95905376..e93df9e4 100644 --- a/content/docs/latest/user/avroserde.md +++ b/content/docs/latest/user/avroserde.md @@ -23,7 +23,7 @@ The AvroSerde allows users to read or write [Avro data](http://avro.apache.org/) * Transparently converts the Avro idiom of handling nullable types as Union[T, null] into just T and returns null when appropriate. * Writes any Hive table to Avro files. * Has worked reliably against our most convoluted Avro schemas in our ETL process. -* Starting in [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-7446), columns can be added to an Avro backed Hive table using the [Alter Table](https://hive.apache.org/docs/latest/language/languagemanual-ddl#addreplace-columns) statement. +* Starting in [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-7446), columns can be added to an Avro backed Hive table using the [Alter Table](/docs/latest/language/languagemanual-ddl#addreplace-columns) statement. For general information about SerDes, see [Hive SerDe]({{< ref "#hive-serde" >}}) in the Developer Guide. Also see [SerDe]({{< ref "serde" >}}) for details about input and output processing. diff --git a/content/docs/latest/user/configuration-properties.md b/content/docs/latest/user/configuration-properties.md index a6b9a083..50105ef9 100644 --- a/content/docs/latest/user/configuration-properties.md +++ b/content/docs/latest/user/configuration-properties.md @@ -9,7 +9,7 @@ This document describes the Hive user configuration properties (sometimes called The canonical list of configuration properties is managed in the `HiveConf` Java class, so refer to the `HiveConf.java` file for a complete list of configuration properties available in your Hive release. -For information about how to use these configuration properties, see [Configuring Hive]({{< ref "#configuring-hive" >}}). 
That document also describes administrative configuration properties for setting up Hive in the [Configuration Variables]({{< ref "#configuration-variables" >}}) section. [Hive Metastore Administration](https://hive.apache.org/docs/latest/admin/adminmanual-metastore-administration) describes additional configuration properties for the metastore. +For information about how to use these configuration properties, see [Configuring Hive]({{< ref "#configuring-hive" >}}). That document also describes administrative configuration properties for setting up Hive in the [Configuration Variables]({{< ref "#configuration-variables" >}}) section. [Hive Metastore Administration](/docs/latest/admin/adminmanual-metastore-administration) describes additional configuration properties for the metastore. Version information @@ -1688,7 +1688,7 @@ Pre-3.1.2 Hive implementation of Parquet stores timestamps in UTC on-file, this #### Avro -See [AvroSerDe](https://hive.apache.org/docs/latest/user/avroserde) for details. +See [AvroSerDe](/docs/latest/user/avroserde) for details. 
##### hive.avro.timestamp.skip.conversion @@ -3132,10 +3132,10 @@ To configure Hive execution to Spark, set the following property to "`spark`": Besides the configuration properties listed in this section, some properties in other sections are also related to Spark: -* [hive.exec.reducers.max](https://hive.apache.org/docs/latest/user/configuration-properties#hiveexecreducersmax) -* [hive.exec.reducers.bytes.per.reducer](https://hive.apache.org/docs/latest/user/configuration-properties#hiveexecreducersbytesperreducer) -* [hive.mapjoin.optimized.hashtable](https://hive.apache.org/docs/latest/user/configuration-properties#hivemapjoinoptimizedhashtable) -* [hive.mapjoin.optimized.hashtable.wbsize](https://hive.apache.org/docs/latest/user/configuration-properties#hivemapjoinoptimizedhashtablewbsize) +* [hive.exec.reducers.max](/docs/latest/user/configuration-properties#hiveexecreducersmax) +* [hive.exec.reducers.bytes.per.reducer](/docs/latest/user/configuration-properties#hiveexecreducersbytesperreducer) +* [hive.mapjoin.optimized.hashtable](/docs/latest/user/configuration-properties#hivemapjoinoptimizedhashtable) +* [hive.mapjoin.optimized.hashtable.wbsize](/docs/latest/user/configuration-properties#hivemapjoinoptimizedhashtablewbsize) hive.spark.job.monitor.timeout diff --git a/content/docs/latest/user/cost-based-optimization-in-hive.md b/content/docs/latest/user/cost-based-optimization-in-hive.md index 8367b4e1..8114c8fb 100644 --- a/content/docs/latest/user/cost-based-optimization-in-hive.md +++ b/content/docs/latest/user/cost-based-optimization-in-hive.md @@ -118,10 +118,10 @@ In this document we propose to use Calcite’s cost based optimizer, Volcano, to ## STATS -* histogram_numeric(): Estimating frequency distributions:  -* histogram() UDAF for a numerical column  -* Built-in Aggregate Functions (UDAF):  -* Annotate hive operator tree with statistics from metastore:  +* histogram_numeric(): [Estimating frequency 
distributions](/docs/latest/language/statisticsanddatamining) +* histogram() UDAF for a numerical column [HIVE-1397](https://issues.apache.org/jira/browse/HIVE-1397) +* Built-in Aggregate Functions ([UDAF](/docs/latest/language/languagemanual-udf)) +* Annotate hive operator tree with statistics from metastore: [HIVE-5369](https://issues.apache.org/jira/browse/HIVE-5369)   @@ -261,7 +261,7 @@ Hive allows users to specify map/reduce/transform operator in the sql; data woul * Cluster By/Distribute By -*Cluster By* and *Distribute By* are used mainly with the [Transform/Map-Reduce Scripts](https://hive.apache.org/docs/latest/language/languagemanual-transform). But, it is sometimes useful in SELECT statements if there is a need to partition and sort the output of a query for subsequent queries. *Cluster By* is a short-cut for both *Distribute By* and *Sort By*. Hive uses the columns in *Distribute By* to distribute the rows among reducers. All rows with the same *Distribute By* columns will go to the same reducer. However, *Distribute By* does not guarantee clustering or sorting properties on the distributed keys. +*Cluster By* and *Distribute By* are used mainly with the [Transform/Map-Reduce Scripts](/docs/latest/language/languagemanual-transform). But, it is sometimes useful in SELECT statements if there is a need to partition and sort the output of a query for subsequent queries. *Cluster By* is a short-cut for both *Distribute By* and *Sort By*. Hive uses the columns in *Distribute By* to distribute the rows among reducers. All rows with the same *Distribute By* columns will go to the same reducer. However, *Distribute By* does not guarantee clustering or sorting properties on the distributed keys. 
* Table Sample diff --git a/content/docs/latest/user/data-connectors-in-hive.md b/content/docs/latest/user/data-connectors-in-hive.md index 2606facb..5a62ae7c 100644 --- a/content/docs/latest/user/data-connectors-in-hive.md +++ b/content/docs/latest/user/data-connectors-in-hive.md @@ -11,7 +11,7 @@ Data connectors (referred to as "connector" in Hive Query Language) are top leve -With [JDBC Storage Handlers](https://hive.apache.org/docs/latest/user/jdbc-storage-handler), users define a table in hive metastore for which data resides in a remote JDBC datastore. This hive table's metadata is persisted locally in hive metastore's backend. When Hiveserver2 runs a query against this table, data is retrieved from the remote JDBC table. While this is very powerful in itself, it has limitations. +With [JDBC Storage Handlers](/docs/latest/user/jdbc-storage-handler), users define a table in hive metastore for which data resides in a remote JDBC datastore. This hive table's metadata is persisted locally in hive metastore's backend. When Hiveserver2 runs a query against this table, data is retrieved from the remote JDBC table. While this is very powerful in itself, it has limitations. * Each remote table in remote datasource has to be individually mapped to a local hive table. It becomes tedious if you have to map an entire database of many tables. * Any new tables in the remote datasource are not automatically visible and will need to be manually mapped in hive. diff --git a/content/docs/latest/user/druid-integration.md b/content/docs/latest/user/druid-integration.md index 2e8c215a..278f71b7 100644 --- a/content/docs/latest/user/druid-integration.md +++ b/content/docs/latest/user/druid-integration.md @@ -223,7 +223,7 @@ Version Info Once we have created our first table stored in Druid using the `DruidStorageHandler`, we are ready to execute our queries against Druid. 
-When we express a query over a Druid table, Hive tries to *rewrite* the query to be executed efficiently by pushing as much computation as possible to Druid. This task is accomplished by the [cost optimizer](https://hive.apache.org/docs/latest/user/cost-based-optimization-in-hive) based in [Apache Calcite](http://calcite.apache.org/), which identifies patterns in the plan and apply rules to rewrite the input query into a new equivalent query with (hopefully) more operations executed in Druid. +When we express a query over a Druid table, Hive tries to *rewrite* the query to be executed efficiently by pushing as much computation as possible to Druid. This task is accomplished by the [cost optimizer](/docs/latest/user/cost-based-optimization-in-hive) based on [Apache Calcite](http://calcite.apache.org/), which identifies patterns in the plan and applies rules to rewrite the input query into a new equivalent query with (hopefully) more operations executed in Druid. In particular, we implemented our extension to the optimizer in [HIVE-14217](https://issues.apache.org/jira/browse/HIVE-14217), which builds upon the work initiated in [CALCITE-1121](https://issues.apache.org/jira/browse/CALCITE-1121), and extends its logic to identify more complex query patterns (*timeseries* queries), translate filters on the *time* dimension to Druid intervals, push limit into Druid *select* queries, etc. diff --git a/content/docs/latest/user/hive-deprecated-authorization-mode.md b/content/docs/latest/user/hive-deprecated-authorization-mode.md index 432eaa2f..8e8b5e92 100644 --- a/content/docs/latest/user/hive-deprecated-authorization-mode.md +++ b/content/docs/latest/user/hive-deprecated-authorization-mode.md @@ -64,11 +64,11 @@ User names are also case sensitive. 
Unlike role names, user names are not manag Quoted Identifiers in Version 0.13.0+ -As of Hive 0.13.0, user and role names may optionally be surrounded by backtick characters (`) when the configuration parameter `[hive.support.quoted.identifiers](https://hive.apache.org/docs/latest/user/configuration-properties#hivesupportquotedidentifiers)` is set to `column` (default value). All [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) characters are permitted in the quoted identifiers, with double backticks (``) representing a backtick character. However when `[hive.support.quoted.identifiers](https://hive.apache.org/docs/latest/user/configuration-properties#hivesupportquotedidentifiers)` is set to `none`, or in Hive 0.12.0 and earlier, only alphanumeric and underscore characters are permitted in user names and role names. +As of Hive 0.13.0, user and role names may optionally be surrounded by backtick characters (\`) when the configuration parameter [`hive.support.quoted.identifiers`](/docs/latest/user/configuration-properties#hivesupportquotedidentifiers) is set to `column` (default value). All [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) characters are permitted in the quoted identifiers, with double backticks (\`\`) representing a backtick character. However when [`hive.support.quoted.identifiers`](/docs/latest/user/configuration-properties#hivesupportquotedidentifiers) is set to `none`, or in Hive 0.12.0 and earlier, only alphanumeric and underscore characters are permitted in user names and role names. For details, see [HIVE-6013](https://issues.apache.org/jira/browse/HIVE-6013) and [Supporting Quoted Identifiers in Column Names](https://issues.apache.org/jira/secure/attachment/12618321/QuotedIdentifier.html). 
-As of [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-8083), user may be optionally surrounded by backtick characters (`) irrespective of the `[hive.support.quoted.identifiers](https://hive.apache.org/docs/latest/user/configuration-properties#hivesupportquotedidentifiers)` setting. +As of [Hive 0.14](https://issues.apache.org/jira/browse/HIVE-8083), user may be optionally surrounded by backtick characters (\`) irrespective of the [`hive.support.quoted.identifiers`](/docs/latest/user/configuration-properties#hivesupportquotedidentifiers) setting. ### Creating/Dropping/Using Roles diff --git a/content/docs/latest/user/hive-transactions.md b/content/docs/latest/user/hive-transactions.md index 7ccaddc0..f97aca10 100644 --- a/content/docs/latest/user/hive-transactions.md +++ b/content/docs/latest/user/hive-transactions.md @@ -19,8 +19,8 @@ Transactions with ACID semantics have been added to Hive to address the followin 1. Streaming ingest of data.  Many users have tools such as [Apache Flume](http://flume.apache.org/), [Apache Storm](https://storm.incubator.apache.org/), or [Apache Kafka](http://kafka.apache.org/) that they use to stream data into their Hadoop cluster.  While these tools can write data at rates of hundreds or more rows per second, Hive can only add partitions every fifteen minutes to an hour.  Adding partitions more often leads quickly to an overwhelming number of partitions in the table.  These tools could stream data into existing partitions, but this would cause readers to get dirty reads (that is, they would see data written after they had started their queries) and leave many small files in their directories that would put pressure on the NameNode.  With this new functionality this use case will be supported while allowing readers to get a consistent view of the data and avoiding too many files. 2. Slow changing dimensions.  In a typical star schema data warehouse, dimensions tables change slowly over time.  
For example, a retailer will open new stores, which need to be added to the stores table, or an existing store may change its square footage or some other tracked characteristic.  These changes lead to inserts of individual records or updates of records (depending on the strategy chosen).  Starting with 0.14, Hive is able to support this. -3. Data restatement.  Sometimes collected data is found to be incorrect and needs correction.  Or the first instance of the data may be an approximation (90% of servers reporting) with the full data provided later.  Or business rules may require that certain transactions be restated due to subsequent transactions (e.g., after making a purchase a customer may purchase a membership and thus be entitled to discount prices, including on the previous purchase).  Or a user may be contractually required to remove their customer’s data upon termination of their relationship.  Starting with Hive 0.14 these use cases can be supported via *[INSERT](https://hive.apache.org/docs/latest/language/languagemanual-dml), [UPDATE](https://hive.apache.org/docs/latest/language/languagemanual-dml#update)*, and *[DELETE](https://hive.apache.org/docs/latest/language/languagemanual-dml#delete)*. -4. Bulk updates using [SQL MERGE](https://hive.apache.org/docs/latest/language/languagemanual-dml#merge) statement. +3. Data restatement.  Sometimes collected data is found to be incorrect and needs correction.  Or the first instance of the data may be an approximation (90% of servers reporting) with the full data provided later.  Or business rules may require that certain transactions be restated due to subsequent transactions (e.g., after making a purchase a customer may purchase a membership and thus be entitled to discount prices, including on the previous purchase).  Or a user may be contractually required to remove their customer’s data upon termination of their relationship.  
Starting with Hive 0.14 these use cases can be supported via *[INSERT](/docs/latest/language/languagemanual-dml), [UPDATE](/docs/latest/language/languagemanual-dml#update)*, and *[DELETE](/docs/latest/language/languagemanual-dml#delete)*. +4. Bulk updates using [SQL MERGE](/docs/latest/language/languagemanual-dml#merge) statement. ## Limitations @@ -33,14 +33,14 @@ Transactions with ACID semantics have been added to Hive to address the followin * The existing ZooKeeper and in-memory lock managers are not compatible with transactions.  There is no intention to address this issue.  See [Basic Design]({{< ref "#basic-design" >}}) below for a discussion of how locks are stored for transactions. * ~~Schema changes using ALTER TABLE is NOT supported for ACID tables. [HIVE-11421](https://issues.apache.org/jira/browse/HIVE-11421) is tracking it.~~  Fixed in 1.3.0/2.0.0. * Using Oracle as the Metastore DB and "datanucleus.connectionPoolingType=BONECP" may generate intermittent "No such lock.." and "No such transaction..." errors.  Setting "datanucleus.connectionPoolingType=DBCP" is recommended in this case. -* [LOAD DATA...](https://hive.apache.org/docs/latest/language/languagemanual-dml#loading-files-into-tables) statement is not supported with transactional tables.  (This was not properly enforced until [HIVE-16732](https://issues.apache.org/jira/browse/HIVE-16732)) +* [LOAD DATA...](/docs/latest/language/languagemanual-dml#loading-files-into-tables) statement is not supported with transactional tables.  
(This was not properly enforced until [HIVE-16732](https://issues.apache.org/jira/browse/HIVE-16732)) ## Streaming APIs Hive offers APIs for streaming data ingest and streaming mutation: * [Hive HCatalog Streaming API]({{< ref "streaming-data-ingest" >}}) -* [Hive Streaming API](https://hive.apache.org/docs/latest/user/streaming-data-ingest-v2) (Since Hive 3) +* [Hive Streaming API](/docs/latest/user/streaming-data-ingest-v2) (Since Hive 3) * [HCatalog Streaming Mutation API]({{< ref "hcatalog-streaming-mutation-api" >}}) (available in Hive 2.0.0 and later) A comparison of these two APIs is available in the [Background]({{< ref "#background" >}}) section of the Streaming Mutation document. @@ -59,7 +59,7 @@ The *SHOW LOCKS* command has been altered to provide information about the new l A new option has been added to *ALTER TABLE* to request a compaction of a table or partition.  In general users do not need to request compactions, as the system will detect the need for them and initiate the compaction.  However, if [compaction is turned off]({{< ref "#compaction-is-turned-off" >}}) for a table or a user wants to compact the table at a time the system would not choose to, *ALTER TABLE* can be used to initiate the compaction.  See [Alter Table/Partition Compact]({{< ref "#alter-table/partition-compact" >}}) for details.  This will enqueue a request for compaction and return.  To watch the progress of the compaction the user can use *SHOW COMPACTIONS*. -A new command *ABORT TRANSACTIONS* has been added, see [Abort Transactions](https://hive.apache.org/docs/latest/language/languagemanual-ddl#abort-transactions) for details. +A new command *ABORT TRANSACTIONS* has been added, see [Abort Transactions](/docs/latest/language/languagemanual-ddl#abort-transactions) for details. 
## Basic Design @@ -125,7 +125,7 @@ A new logical entity called "transaction manager"  was added which incorporated As of [Hive 1.3.0](https://issues.apache.org/jira/browse/HIVE-12529), the length of time that the DbLockManger will continue to try to acquire locks can be controlled via [hive.lock.numretires](http://Configuration Properties#hive.lock.numretires) and [hive.lock.sleep.between.retries](http://Configuration Properties#hive.lock.sleep.between.retries).  When the DbLockManager cannot acquire a lock (due to existence of a competing lock), it will back off and try again after a certain time period.  In order to support short running queries and not overwhelm the metastore at the same time, the DbLockManager will double the wait time after each retry.  The initial back off time is 100ms and is capped by hive.lock.sleep.between.retries.  hive.lock.numretries is the total number of times it will retry a given lock request.  Thus the total time that the call to acquire locks will block (given values of 100 retries and 60s sleep time) is (100ms + 200ms + 400ms + ... + 51200ms + 60s + 60s + ... + 60s) = 91m:42s:300ms. -More [details](https://hive.apache.org/docs/latest/language/languagemanual-ddl#show-locks) on locks used by this Lock Manager. +More [details](/docs/latest/language/languagemanual-ddl#show-locks) on locks used by this Lock Manager. Note that the lock manager used by DbTxnManager will acquire locks on all tables, even those without "transactional=true" property.  By default, Insert operation into a non-transactional table will acquire an exclusive lock and thus block other inserts and reads.  While technically correct, this is a departure from how Hive traditionally worked (i.e. w/o a lock manger).  
For backwards compatibility, [hive.txn.strict.locking.mode](http://Configuration Properties#hive.txn.strict.locking.mode) (see table below) is provided which will make this lock manager acquire shared locks on insert operations on non-transactional tables.  This restores previous semantics while still providing the benefit of a lock manager such as preventing table drop while it is being read.  Note that for transactional tables, insert always acquires share locks since these tables implement MVCC architecture at the storage layer and are able to provide strong read consistency (Snapshot Isolation) even in presence of concurrent modification operations. @@ -157,7 +157,7 @@ A number of new configuration parameters have been added to the system to suppor | [hive.txn.manager]({{< ref "#hive-txn-manager" >}})  | *Default:* org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager*Value required for transactions:* org.apache.hadoop.hive.ql.lockmgr.DbTxnManager | Client/HiveServer2 | DummyTxnManager replicates pre Hive-0.13 behavior and provides no transactions. | | [hive.txn.strict.locking.mode]({{< ref "#hive-txn-strict-locking-mode" >}}) | *Default:* true | Client/ HiveServer2 | In strict mode non-ACID resources use standard R/W lock semantics, e.g. INSERT will acquire exclusive lock. In non-strict mode, for non-ACID resources, INSERT will only acquire shared lock, which allows two concurrent writes to the same partition but still lets lock manager prevent DROP TABLE etc. when the table is being written to (as of [Hive 2.2.0](https://issues.apache.org/jira/browse/HIVE-15774)). | | [hive.txn.timeout]({{< ref "#hive-txn-timeout" >}})  | *Default:* 300 | Client/HiveServer2/Metastore  | Time after which transactions are declared aborted if the client has not sent a heartbeat, in seconds. 
It's critical that this property has the same value for all components/services.5 | -| [hive.txn.heartbeat.threadpool.size](https://hive.apache.org/docs/latest/user/configuration-properties#hivetxnheartbeatthreadpoolsize) | *Default:* 5 | Client/HiveServer2 | The number of threads to use for heartbeating (as of [Hive 1.3.0 and 2.0.0](https://issues.apache.org/jira/browse/HIVE-12366)). | +| [hive.txn.heartbeat.threadpool.size](/docs/latest/user/configuration-properties#hivetxnheartbeatthreadpoolsize) | *Default:* 5 | Client/HiveServer2 | The number of threads to use for heartbeating (as of [Hive 1.3.0 and 2.0.0](https://issues.apache.org/jira/browse/HIVE-12366)). | | [hive.timedout.txn.reaper.start]({{< ref "#hive-timedout-txn-reaper-start" >}}) | *Default:* 100s | Metastore | Time delay of first reaper (the process which aborts timed-out transactions) run after the metastore starts (as of [Hive 1.3.0](https://issues.apache.org/jira/browse/HIVE-11317)). Controls AcidHouseKeeperServcie above. | | [hive.timedout.txn.reaper.interval]({{< ref "#hive-timedout-txn-reaper-interval" >}}) | *Default:* 180s | Metastore | Time interval describing how often the reaper (the process which aborts timed-out transactions) runs (as of [Hive 1.3.0](https://issues.apache.org/jira/browse/HIVE-11317)). Controls AcidHouseKeeperServcie above. | | [hive.txn.max.open.batch]({{< ref "#hive-txn-max-open-batch" >}}) | *Default:* 1000 | Client | Maximum number of transactions that can be fetched in one call to open_txns().1 | @@ -180,7 +180,7 @@ A number of new configuration parameters have been added to the system to suppor | hive.compactor.history.retention.succeeded | *Default: 3* | Metastore | Number of successful compaction entries to retain in history (per partition). | | hive.compactor.history.retention.failed | *Default: 3* | Metastore | Number of failed compaction entries to retain in history (per partition). 
| | hive.compactor.history.retention.attempted | *Default: 2* | Metastore | Number of attempted compaction entries to retain in history (per partition). | -| hive.compactor.initiator.failed.compacts.threshold | *Default: 2* | Metastore | Number of of consecutive failed compactions for a given partition after which the Initiator will stop attempting to schedule compactions automatically. It is still possible to use [ALTER TABLE](https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact) to initiate compaction. Once a manually initiated compaction succeeds auto initiated compactions will resume. Note that this must be less than hive.compactor.history.retention.failed. | +| hive.compactor.initiator.failed.compacts.threshold | *Default: 2* | Metastore | Number of consecutive failed compactions for a given partition after which the Initiator will stop attempting to schedule compactions automatically. It is still possible to use [ALTER TABLE](/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact) to initiate compaction. Once a manually initiated compaction succeeds auto initiated compactions will resume. Note that this must be less than hive.compactor.history.retention.failed. | | hive.compactor.history.reaper.interval | *Default: 2m* | Metastore | Controls how often the process to purge historical record of compactions runs. | 1. hive.txn.max.open.batch controls how many transactions streaming agents such as Flume or Storm open simultaneously.  The streaming agent then writes that number of entries into a single file (per Flume agent or Storm bolt).  Thus increasing this value decreases the number of delta files created by streaming agents.  But it also increases the number of open transactions that Hive has to track at any given time, which may negatively affect read performance. 
@@ -209,7 +209,7 @@ If the data in your system is not owned by the Hive user (i.e., the user that th ### Compaction pooling -More in formation on compaction pooling can be found here: [Compaction pooling](https://hive.apache.org/docs/latest/language/compaction-pooling) +More information on compaction pooling can be found here: [Compaction pooling](/docs/latest/language/compaction-pooling) ## Table Properties @@ -219,7 +219,7 @@ If a table owner does not wish the system to automatically determine when to com Table properties are set with the TBLPROPERTIES clause when a table is created or altered, as described in the [Create Table]({{< ref "#create-table" >}}) and [Alter Table Properties]({{< ref "#alter-table-properties" >}}) sections of Hive Data Definition Language. The "`transactional`" and "`NO_AUTO_COMPACTION`" table properties are case-sensitive in Hive releases 0.x and 1.0, but they are case-insensitive starting with release 1.1.0 ([HIVE-8308](https://issues.apache.org/jira/browse/HIVE-8308)). -More compaction related options can be set via TBLPROPERTIES as of [Hive 1.3.0 and 2.1.0](https://issues.apache.org/jira/browse/HIVE-13354). They can be set at both table-level via [CREATE TABLE](https://hive.apache.org/docs/latest/language/languagemanual-ddl#createdroptruncate-table), and on request-level via [ALTER TABLE/PARTITION COMPACT](https://hive.apache.org/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact).  These are used to override the Warehouse/table wide settings.  For example, to override an MR property to affect a compaction job, one can add "compactor.=" in either CREATE TABLE statement or when launching a compaction explicitly via ALTER TABLE.  The "=" will be set on JobConf of the compaction MR job.   Similarly, "tblprops.=" can be used to set/override any table property which is interpreted by the code running on the cluster.  
Finally, "compactorthreshold.=" can be used to override properties from the "New Configuration Parameters for Transactions" table above that end with ".threshold" and control when compactions are triggered by the system.  Examples: +More compaction related options can be set via TBLPROPERTIES as of [Hive 1.3.0 and 2.1.0](https://issues.apache.org/jira/browse/HIVE-13354). They can be set at both table-level via [CREATE TABLE](/docs/latest/language/languagemanual-ddl#createdroptruncate-table), and on request-level via [ALTER TABLE/PARTITION COMPACT](/docs/latest/language/languagemanual-ddl#alter-tablepartition-compact).  These are used to override the Warehouse/table wide settings.  For example, to override an MR property to affect a compaction job, one can add "compactor.=" in either CREATE TABLE statement or when launching a compaction explicitly via ALTER TABLE.  The "=" will be set on JobConf of the compaction MR job.   Similarly, "tblprops.=" can be used to set/override any table property which is interpreted by the code running on the cluster.  Finally, "compactorthreshold.=" can be used to override properties from the "New Configuration Parameters for Transactions" table above that end with ".threshold" and control when compactions are triggered by the system.  Examples: **Example: Set compaction options in TBLPROPERTIES at table level** diff --git a/content/docs/latest/user/hiveserver2-overview.md b/content/docs/latest/user/hiveserver2-overview.md index c94884ab..b840e3af 100644 --- a/content/docs/latest/user/hiveserver2-overview.md +++ b/content/docs/latest/user/hiveserver2-overview.md @@ -7,7 +7,7 @@ date: 2024-12-12 # Introduction -HiveServer2 (HS2) is a service that enables clients to execute queries against Hive. HiveServer2 is the successor to [HiveServer1](https://hive.apache.org/docs/latest/admin/hiveserver) which has been deprecated. HS2 supports multi-client concurrency and authentication. 
It is designed to provide better support for open API clients like JDBC and ODBC. +HiveServer2 (HS2) is a service that enables clients to execute queries against Hive. HiveServer2 is the successor to [HiveServer1](/docs/latest/admin/hiveserver) which has been deprecated. HS2 supports multi-client concurrency and authentication. It is designed to provide better support for open API clients like JDBC and ODBC. HS2 is a single process running as a composite service, which includes the Thrift-based Hive service (TCP or HTTP) and a [Jetty](http://www.eclipse.org/jetty/) web server for web UI.  diff --git a/content/docs/latest/user/kudu-integration.md b/content/docs/latest/user/kudu-integration.md index 5be3050e..b4340867 100644 --- a/content/docs/latest/user/kudu-integration.md +++ b/content/docs/latest/user/kudu-integration.md @@ -13,7 +13,7 @@ date: 2024-12-12 The initial implementation was added to Hive 4.0 in [HIVE-12971](https://issues.apache.org/jira/browse/HIVE-12971) and is designed to work with Kudu 1.2+. -There are two main components which make up the implementation: the `KuduStorageHandler` and the `KuduPredicateHandler`. The `KuduStorageHandler` is a Hive [StorageHandler](https://hive.apache.org/development/desingdocs/storagehandlers) implementation. The primary roles of this class are to manage the mapping of a Hive table to a Kudu table and configures Hive queries. The KuduPredicateHandler is used push down filter operations to Kudu for more efficient IO. +There are two main components which make up the implementation: the `KuduStorageHandler` and the `KuduPredicateHandler`. The `KuduStorageHandler` is a Hive [StorageHandler](/development/desingdocs/storagehandlers) implementation. The primary roles of this class are to manage the mapping of a Hive table to a Kudu table and to configure Hive queries. The KuduPredicateHandler is used to push down filter operations to Kudu for more efficient IO. 
**NOTE**: The initial implementation is considered `***experimental***` as there are remaining [sub-jiras](https://issues.apache.org/jira/browse/HIVE-12971) open to make the implementation more configurable and performant. Currently only external tables pointing at existing Kudu tables are supported. Support for creating and altering underlying Kudu tables in tracked via [HIVE-22021](https://issues.apache.org/jira/browse/HIVE-22021). Additionally full support for UPDATE, UPSERT, and DELETE statement support is tracked by [HIVE-22027](https://issues.apache.org/jira/browse/HIVE-22027). @@ -54,7 +54,7 @@ Because Impala creates tables with the same storage handler metadata in the Hiv ## Data Ingest -Though it is a common practice to ingest the data into Kudu tables via tools like [Apache NiFi](https://nifi.apache.org/) or [Apache Spark](https://spark.apache.org/) and query the data via Hive, data can also be inserted to the Kudu tables via [Hive INSERT statements](https://hive.apache.org/docs/latest/language/languagemanual-dml#inserting-data-into-hive-tables-from-queries). It is important to note that when data is inserted a Kudu UPSERT operation is actually used to avoid primary key constraint issues. Making this more flexible is tracked via [HIVE-22024](https://issues.apache.org/jira/browse/HIVE-22024). Additionally UPDATE and DELETE operations are not supported. Enabling that functionality is tracked via [HIVE-22027](https://issues.apache.org/jira/browse/HIVE-22027). +Though it is a common practice to ingest the data into Kudu tables via tools like [Apache NiFi](https://nifi.apache.org/) or [Apache Spark](https://spark.apache.org/) and query the data via Hive, data can also be inserted to the Kudu tables via [Hive INSERT statements](/docs/latest/language/languagemanual-dml#inserting-data-into-hive-tables-from-queries). It is important to note that when data is inserted a Kudu UPSERT operation is actually used to avoid primary key constraint issues. 
Making this more flexible is tracked via [HIVE-22024](https://issues.apache.org/jira/browse/HIVE-22024). Additionally UPDATE and DELETE operations are not supported. Enabling that functionality is tracked via [HIVE-22027](https://issues.apache.org/jira/browse/HIVE-22027). ### Examples diff --git a/content/docs/latest/user/materialized-views-in-hive.md b/content/docs/latest/user/materialized-views-in-hive.md index f48ed6f6..5799a2b4 100644 --- a/content/docs/latest/user/materialized-views-in-hive.md +++ b/content/docs/latest/user/materialized-views-in-hive.md @@ -19,7 +19,7 @@ In this section, we present the main operations that are currently present in Hi ### Materialized views creation -The syntax to create a materialized view in Hive is very similar to the [CTAS statement](https://hive.apache.org/docs/latest/language/languagemanual-ddl#create-table-as-select-ctas) syntax, supporting common features such as partition columns, custom storage handler, or passing table properties. +The syntax to create a materialized view in Hive is very similar to the [CTAS statement](/docs/latest/language/languagemanual-ddl#create-table-as-select-ctas) syntax, supporting common features such as partition columns, custom storage handler, or passing table properties. | `CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db_name.]materialized_view_name``[DISABLE REWRITE]``[COMMENT materialized_view_comment]``[PARTITIONED ON (col_name, ...)]``[CLUSTERED ON (col_name, ...) | DISTRIBUTED ON (col_name, ...) 
SORTED ON (col_name, ...)]``[``[ROW FORMAT row_format]``[STORED AS file_format]``| STORED BY``'[storage.handler.class.name](http://storage.handler.class.name)'` `[WITH SERDEPROPERTIES (...)]``]``[LOCATION hdfs_path]``[TBLPROPERTIES (property_name=property_value, ...)]``AS``;` | @@ -29,7 +29,7 @@ By default, materialized views are usable for query rewriting by the optimizer, The default values for SerDe and storage format when they are not specified in the materialized view creation statement (they are optional) are specified using the configuration properties `hive.materializedview.serde` and `hive.materializedview.fileformat`, respectively. -Materialized views can be stored in external systems, e.g., [Druid](https://hive.apache.org/docs/latest/user/druid-integration), using custom storage handlers. For instance, the following statement creates a materialized view that is stored in Druid: +Materialized views can be stored in external systems, e.g., [Druid](/docs/latest/user/druid-integration), using custom storage handlers. For instance, the following statement creates a materialized view that is stored in Druid: **Example:** | `CREATE MATERIALIZED VIEW druid_wiki_mv``STORED AS``'org.apache.hadoop.hive.druid.DruidStorageHandler'``AS``SELECT __time, page, user, c_added, c_removed``FROM src;` | diff --git a/content/docs/latest/user/permission-inheritance-in-hive.md b/content/docs/latest/user/permission-inheritance-in-hive.md index fed65d2b..6f21eaec 100644 --- a/content/docs/latest/user/permission-inheritance-in-hive.md +++ b/content/docs/latest/user/permission-inheritance-in-hive.md @@ -35,7 +35,7 @@ See [HIVE-11481](https://issues.apache.org/jira/browse/HIVE-11481). ## **Behavior** -* When "[hive.warehouse.subdir.inherit.perms](https://hive.apache.org/docs/latest/user/configuration-properties#hivewarehousesubdirinheritperms)" flag is enabled in Hive, Hive will try to do all the following inheritances. 
+* When "[hive.warehouse.subdir.inherit.perms](/docs/latest/user/configuration-properties#hivewarehousesubdirinheritperms)" flag is enabled in Hive, Hive will try to do all the following inheritances. + Database directory inherits from warehouse directory. + Table directory inherits from database directory, or from warehouse directory if it is part of the default database. + External table directory inherits from parent directory. @@ -52,7 +52,7 @@ Most of this functionality was added as of Hive 0.14.   See umbrella JIRA [HIVE-6892](https://issues.apache.org/jira/browse/HIVE-6892) for details. -[hive.warehouse.subdir.inherit.perms](https://hive.apache.org/docs/latest/user/configuration-properties#hivewarehousesubdirinheritperms) was removed in Hive 3.0.0. The feature is no longer needed in Hive as the traditional permission model has largely been replaced by external security systems such as Ranger and Sentry. A user may choose SQLStdAuth which ships with Hive if user doesn't want to use an external security system. +[hive.warehouse.subdir.inherit.perms](/docs/latest/user/configuration-properties#hivewarehousesubdirinheritperms) was removed in Hive 3.0.0. The feature is no longer needed in Hive as the traditional permission model has largely been replaced by external security systems such as Ranger and Sentry. A user may choose SQLStdAuth which ships with Hive if user doesn't want to use an external security system. diff --git a/content/docs/latest/user/serde.md b/content/docs/latest/user/serde.md index 5b06a95b..3f1bb7f4 100644 --- a/content/docs/latest/user/serde.md +++ b/content/docs/latest/user/serde.md @@ -24,7 +24,7 @@ The Hive SerDe library is in org.apache.hadoop.hive.serde2. 
(The old SerDe lib * [RegEx]({{< ref "#regex" >}}) * [Thrift](http://thrift.apache.org/) * [Parquet]({{< ref "parquet" >}}) (Hive 0.13 and later) -* [CSV](https://hive.apache.org/docs/latest/user/csv-serde) (Hive 0.14 and later) +* [CSV](/docs/latest/user/csv-serde) (Hive 0.14 and later) * [JsonSerDe]({{< ref "#jsonserde" >}}) (Hive 0.12 and later in [hcatalog-core](https://github.com/apache/hive/blob/master/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java)) Note: For Hive releases prior to 0.12, Amazon provides a JSON SerDe available at `s3://elasticmapreduce/samples/hive-ads/libs/jsonserde.jar`. diff --git a/content/docs/latest/user/streaming-data-ingest-v2.md b/content/docs/latest/user/streaming-data-ingest-v2.md index 98823c21..38e3c4e8 100644 --- a/content/docs/latest/user/streaming-data-ingest-v2.md +++ b/content/docs/latest/user/streaming-data-ingest-v2.md @@ -31,8 +31,8 @@ A few things are required to use streaming. 1. The following settings are required in hive-site.xml to enable ACID support for streaming: 1. **hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager** - 2. **hive.compactor.initiator.on = true**(See more important details [here](https://hive.apache.org/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) - 3. **hive.compactor.cleaner.on = true** (From Hive 4.0.0 onwards. See more important details [here](https://hive.apache.org/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) + 2. **hive.compactor.initiator.on = true**(See more important details [here](/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) + 3. **hive.compactor.cleaner.on = true** (From Hive 4.0.0 onwards. See more important details [here](/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) 4. **hive.compactor.worker.threads** > **0** 2. 
*“stored as orc”* must be specified during [table creation]({{< ref "#table-creation" >}}). Only [ORC storage format]({{< ref "languagemanual-orc" >}}) is supported currently. 3. tblproperties("transactional"="true") must be set on the table during creation. @@ -77,7 +77,7 @@ HiveStreamingConnection API also supports 2 partitioning mode (static vs dynamic Transactions are implemented slightly differently than traditional database systems. Each transaction has an id and multiple transactions are grouped into a “Transaction Batch”. This helps grouping records from multiple transactions into fewer files (rather than 1 file per transaction). During hive streaming connection creation, transaction batch size can be specified via builder API. Transaction management is completely hidden behind the API, in most cases users do not have to worry about tuning the transaction batch size (which is an expert level setting and might not be honored in future release). Also the API automatically rolls over to next transaction batch on beginTransaction() invocation if the current transaction batch is exhausted. The recommendation is to leave the transaction batch size at default value of 1 and group several thousands records together under a each transaction. Since each transaction corresponds to a delta directory in the filesystem, committing transaction too often can end up creating too many small directories.  -Transactions in a TransactionBatch are eventually expired by the Metastore if not committed or aborted after [hive.txn.timeout](https://hive.apache.org/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions) secs. In order to keep the transactions alive, HiveStreamingConnection has a heartbeater thread which by default sends heartbeat after (hive.txn.timeout/2) intervals for all the open transactions.  
+Transactions in a TransactionBatch are eventually expired by the Metastore if not committed or aborted after [hive.txn.timeout](/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions) secs. In order to keep the transactions alive, HiveStreamingConnection has a heartbeater thread which by default sends heartbeat after (hive.txn.timeout/2) intervals for all the open transactions.  See the [Javadoc for HiveStreamingConnection](http://hive.apache.org/javadocs/r3.0.0/api/org/apache/hive/streaming/HiveStreamingConnection.html) for more information.  @@ -85,7 +85,7 @@ See the [Javadoc for HiveStreamingConnection](http://hive.apache.org/javadocs/r Generally, the more records are included in each transaction the more throughput can be achieved.  It's common to commit either after a certain number of records or after a certain time interval, whichever comes first.  The later ensures that when event flow rate is variable, transactions don't stay open too long.  There is no practical limit on how much data can be included in a single transaction. The only concern is amount of data which will need to be replayed if the transaction fails. The concept of a TransactionBatch serves to reduce the number of files (and delta directories) created by HiveStreamingConnection API in the filesystem. Since all transactions in a given transaction batch write to the same physical file (per bucket), a partition can only be compacted up to the the level of the earliest transaction of any batch which contains an open transaction.  Thus TransactionBatches should not be made excessively large.  It makes sense to include a timer to close a TransactionBatch (even if it has unused transactions) after some amount of time. 
-The HiveStreamingConnection is highly optimized for write throughput ([Delta Streaming Optimizations](http://hive.apache.org/javadocs/r3.0.0/api/org/apache/hive/streaming/HiveStreamingConnection.Builder.html#withStreamingOptimizations-boolean-)) and as a result the delta files generated by Hive streaming ingest have many of the ORC features disabled (dictionary encoding, indexes, compression, etc.) to facilitate high throughput writes. When the compactor kicks in, these delta files get rewritten into read- and storage-optimized ORC format (enable dictionary encoding, indexes and compression). So it is recommended to configure the compactor more aggressively/frequently (refer to [Compactor](https://hive.apache.org/docs/latest/user/hive-transactions#compactor)) to generate compacted and optimized ORC files. +The HiveStreamingConnection is highly optimized for write throughput ([Delta Streaming Optimizations](http://hive.apache.org/javadocs/r3.0.0/api/org/apache/hive/streaming/HiveStreamingConnection.Builder.html#withStreamingOptimizations-boolean-)) and as a result the delta files generated by Hive streaming ingest have many of the ORC features disabled (dictionary encoding, indexes, compression, etc.) to facilitate high throughput writes. When the compactor kicks in, these delta files get rewritten into read- and storage-optimized ORC format (enable dictionary encoding, indexes and compression). So it is recommended to configure the compactor more aggressively/frequently (refer to [Compactor](/docs/latest/user/hive-transactions#compactor)) to generate compacted and optimized ORC files. 
### Notes about the HiveConf Object diff --git a/content/docs/latest/user/streaming-data-ingest.md b/content/docs/latest/user/streaming-data-ingest.md index b4b22020..4147f8d2 100644 --- a/content/docs/latest/user/streaming-data-ingest.md +++ b/content/docs/latest/user/streaming-data-ingest.md @@ -7,7 +7,7 @@ date: 2024-12-12 # Hive 3 Streaming API -[Hive 3 Streaming API Documentation](https://hive.apache.org/docs/latest/user/streaming-data-ingest-v2) - new API available in Hive 3 +[Hive 3 Streaming API Documentation](/docs/latest/user/streaming-data-ingest-v2) - new API available in Hive 3 # Hive HCatalog Streaming API @@ -31,8 +31,8 @@ A few things are required to use streaming. 1. The following settings are required in hive-site.xml to enable ACID support for streaming: 1. **hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager** - 2. **hive.compactor.initiator.on = true**(See more important details [here](https://hive.apache.org/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) - 3. **hive.compactor.cleaner.on = true** (From Hive 4.0.0 onwards. See more important details [here](https://hive.apache.org/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) + 2. **hive.compactor.initiator.on = true**(See more important details [here](/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) + 3. **hive.compactor.cleaner.on = true** (From Hive 4.0.0 onwards. See more important details [here](/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions)) 4. **hive.compactor.worker.threads** > **0** 2. *“stored as orc”* must be specified during [table creation]({{< ref "#table-creation" >}}). Only [ORC storage format]({{< ref "languagemanual-orc" >}}) is supported currently. 3. tblproperties("transactional"="true") must be set on the table during creation. @@ -74,7 +74,7 @@ TransactionBatch is used to write a series of transactions. 
There is one file cr For each transaction in the TxnBatch, the application calls *beginNextTransaction*, *write,* and then *commit* or *abort* as appropriate. See the [Javadoc](http://hive.apache.org/javadocs/r1.2.1/api/org/apache/hive/hcatalog/streaming/TransactionBatch.html) for details.  A Transaction cannot include data from more than one partition. -Transactions in a TransactionBatch are eventually expired by the Metastore if not committed or aborted after [hive.txn.timeout](https://hive.apache.org/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions) secs. TrasnactionBatch class provides a **heartbeat()** method to prolong the lifetime of unused transactions in the batch.  A good rule of thumb is to send call heartbeat() at (hive.txn.timeout/2) intervals after creating a TransactionBatch.  This is sufficient to keep an inactive transaction alive but not load the metastore unnecessarily. +Transactions in a TransactionBatch are eventually expired by the Metastore if not committed or aborted after [hive.txn.timeout](/docs/latest/user/hive-transactions#new-configuration-parameters-for-transactions) secs. TransactionBatch class provides a **heartbeat()** method to prolong the lifetime of unused transactions in the batch.  A good rule of thumb is to call heartbeat() at (hive.txn.timeout/2) intervals after creating a TransactionBatch.  This is sufficient to keep an inactive transaction alive but not load the metastore unnecessarily. 
#### Usage Guidelines @@ -263,7 +263,7 @@ secureConn.close(); # Knowledge Base -* [Talks and Presentations](https://hive.apache.org/docs/latest/user/hive-transactions#talks-and-presentations) +* [Talks and Presentations](/docs/latest/user/hive-transactions#talks-and-presentations) * [Lessons learnt from NiFi streaming data to Hive transactional tables](https://community.hortonworks.com/articles/139876/lessons-learnt-from-nifi-streaming-data-to-hive-tr.html) diff --git a/content/docs/latest/user/tutorial.md b/content/docs/latest/user/tutorial.md index 6f3624ff..f2906226 100644 --- a/content/docs/latest/user/tutorial.md +++ b/content/docs/latest/user/tutorial.md @@ -101,7 +101,7 @@ The Types are organized in the following hierarchy (where the parent is a super This type hierarchy defines how the types are implicitly converted in the query language. Implicit conversion is allowed for types from child to an ancestor. So when a query expression expects type1 and the data is of type2, type2 is implicitly converted to type1 if type1 is an ancestor of type2 in the type hierarchy. Note that the type hierarchy allows the implicit conversion of STRING to DOUBLE. -Explicit type conversion can be done using the cast operator as shown in the [#Built In Functions](https://hive.apache.org/docs/latest/user/tutorial#built-in-functions) section below. +Explicit type conversion can be done using the cast operator as shown in the [#Built In Functions](/docs/latest/user/tutorial#built-in-functions) section below. ### Complex Types diff --git a/content/docs/latest/user/union-optimization.md b/content/docs/latest/user/union-optimization.md index bdcee8ca..d792f587 100644 --- a/content/docs/latest/user/union-optimization.md +++ b/content/docs/latest/user/union-optimization.md @@ -28,7 +28,7 @@ Union may have more than 2 parents. Let's say the output directory of the final file sink was dir_final. 
We will replace the output directories of subq1 and subq2 with dir_final/subquery_1 and dir_final/subquery_2, respectively. All other properties of the final file sink like gatherStats, etc. will also be copied. After this, we remove the union and everything below it. -The optimization is important for , but should also be useful in other cases. +The optimization is important for [Skewed Join Optimization](/development/desingdocs/skewed-join-optimization), but should also be useful in other cases. diff --git a/content/docs/latest/webhcat/webhcat-configure.md b/content/docs/latest/webhcat/webhcat-configure.md index a810863b..31d18537 100644 --- a/content/docs/latest/webhcat/webhcat-configure.md +++ b/content/docs/latest/webhcat/webhcat-configure.md @@ -99,7 +99,7 @@ Default values prior to Hive 0.11 are listed in the HCatalog 0.5.0 documentation Previous: [Installation]({{< ref "webhcat-installwebhcat" >}}) Next: [Reference]({{< ref "webhcat-reference" >}}) -Hive configuration: [Configuring Hive]({{< ref "adminmanual-configuration" >}}), [Hive Configuration Properties](https://hive.apache.org/docs/latest/user/configuration-properties), [Thrift Server Setup]({{< ref "#thrift-server-setup" >}}) +Hive configuration: [Configuring Hive]({{< ref "adminmanual-configuration" >}}), [Hive Configuration Properties](/docs/latest/user/configuration-properties), [Thrift Server Setup]({{< ref "#thrift-server-setup" >}})