diff --git a/Android.common.mk b/Android.common.mk index 60459d16eba..8e9e10a238d 100644 --- a/Android.common.mk +++ b/Android.common.mk @@ -31,6 +31,7 @@ LOCAL_C_INCLUDES += \ MESA_VERSION := $(shell cat $(MESA_TOP)/VERSION) LOCAL_CFLAGS += \ + -O3 \ -Wno-error \ -Wno-unused-parameter \ -Wno-pointer-arith \ @@ -77,14 +78,23 @@ LOCAL_CFLAGS += \ -fvisibility=hidden \ -fno-math-errno \ -fno-trapping-math \ - -Wno-sign-compare + -Wno-sign-compare \ + -Wno-self-assign \ + -Wno-constant-logical-operand \ + -Wno-format \ + -Wno-incompatible-pointer-types \ + -Wno-enum-conversion LOCAL_CPPFLAGS += \ -D__STDC_CONSTANT_MACROS \ -D__STDC_FORMAT_MACROS \ -D__STDC_LIMIT_MACROS \ -Wno-error=non-virtual-dtor \ - -Wno-non-virtual-dtor + -Wno-non-virtual-dtor \ + -Wno-delete-non-virtual-dtor \ + -Wno-overloaded-virtual \ + -Wno-missing-braces \ + -Wno-deprecated-register # mesa requires at least c99 compiler LOCAL_CONLYFLAGS += \ diff --git a/Makefile.am b/Makefile.am index e7e14f5b3cd..6d3c8cc19b4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,6 +22,7 @@ SUBDIRS = src AM_DISTCHECK_CONFIGURE_FLAGS = \ + --enable-autotools \ --enable-dri \ --enable-dri3 \ --enable-egl \ diff --git a/Readme.md b/Readme.md new file mode 100644 index 00000000000..5df295abc3a --- /dev/null +++ b/Readme.md @@ -0,0 +1,2 @@ +Any security related issues should be reported by following the instructions here: +https://01.org/security diff --git a/VERSION b/VERSION index 5bd94c44a5c..02f94dcfc16 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-devel +19.0.5 diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore new file mode 100644 index 00000000000..6c3f01790e0 --- /dev/null +++ b/bin/.cherry-ignore @@ -0,0 +1,40 @@ +# Both of these were already merged with different shas +da48cba61ef6fefb799bf96e6364b70dbf4ec712 +c812c740e60c14060eb89db66039111881a0f42f + +# The commit these fix was reverted from 19.0, but fixed for 19.1 due +# to the number of fixes required to make that commit work 
+8d8f80af3a17354508f2ec9d6559c915d5be351d +0c0c69729b6d72a5297122856c8fe48510e90764 +0881e90c09965818b02e359474a6f7446b41d647 +b031c643491a92a5574c7a4bd659df33f2d89bb6 + +# These were manually rebased by Jason, thanks! +8ab95b849e66f3221d80a67eef2ec6e3730901a8 +5c30fffeec1732c21d600c036f95f8cdb1bb5487 + +# This doesn't actually appliy to 19.0 +29179f58c6ba8099859ea25900214dbbd3814a92 + +# This was superceeded by a manual backport from ken +6981069fc805da1afc867ca3c905075d146d7ff9 + +# This was manually backported +0bc1942c9ddce4e796322a7561f06af5dec0decd + +# This doesn't need to be applied, it already seems to exist in stable. +80dc78407d0d1e03ceddf8889b217e8fd113568d + +# This was backported manually +4f18c43d1df64135e8968a7d4fbfd2c9918b76ae + +# These were de-nominated since they don't apply nicley +88105375c978f9de82af8c654051e5aa16d61614 +c9358621276ae49162e58d4a16fe37abda6a347f + +# These are only for 19.1 +c3538ab5702ceeead284c2b5f9e700f3082c8135 +d2aa65eb1892f7b300ac24560f9dbda6b600b5a7 +78e35df52aa2f7d770f929a0866a0faa89c261a9 +0f1b070bad34c46c4bcc6c679fa533bf6b4b79e5 +ad2b4aa37806779bdfc15d704940136c3db21eb4 diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh index 15f0e7d4a34..8fa4f438771 100755 --- a/bin/get-pick-list.sh +++ b/bin/get-pick-list.sh @@ -13,12 +13,12 @@ is_stable_nomination() { - git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable" + git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-stable" } is_typod_nomination() { - git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev" + git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-dev" } fixes= diff --git a/bin/install_megadrivers.py b/bin/install_megadrivers.py index d29b1911218..b5ac78887bf 100644 --- a/bin/install_megadrivers.py +++ b/bin/install_megadrivers.py @@ -35,7 +35,11 @@ def main(): args = parser.parse_args() if os.path.isabs(args.libdir): - to = os.path.join(os.environ.get('DESTDIR', '/'), args.libdir[1:]) + destdir = 
os.environ.get('DESTDIR') + if destdir: + to = os.path.join(destdir, args.libdir[1:]) + else: + to = args.libdir else: to = os.path.join(os.environ['MESON_INSTALL_DESTDIR_PREFIX'], args.libdir) @@ -45,7 +49,6 @@ def main(): if os.path.lexists(to): os.unlink(to) os.makedirs(to) - shutil.copy(args.megadriver, master) for driver in args.drivers: abs_driver = os.path.join(to, driver) @@ -67,7 +70,14 @@ def main(): name, ext = os.path.splitext(name) finally: os.chdir(ret) + + # Remove meson-created master .so and symlinks os.unlink(master) + name, ext = os.path.splitext(master) + while ext != '.so': + if os.path.lexists(name): + os.unlink(name) + name, ext = os.path.splitext(name) if __name__ == '__main__': diff --git a/configure.ac b/configure.ac index 858da79f4d0..b288ecbd265 100644 --- a/configure.ac +++ b/configure.ac @@ -122,7 +122,7 @@ LLVM_REQUIRED_OPENCL=3.9.0 LLVM_REQUIRED_R600=3.9.0 LLVM_REQUIRED_RADEONSI=7.0.0 LLVM_REQUIRED_RADV=7.0.0 -LLVM_REQUIRED_SWR=6.0.0 +LLVM_REQUIRED_SWR=7.0.0 dnl Check for progs AC_PROG_CPP @@ -1922,7 +1922,7 @@ if test x"$enable_dri3" = xyes; then dri3_modifier_modules="xcb-dri3 >= $XCBDRI3_MODIFIERS_REQUIRED xcb-present >= $XCBPRESENT_MODIFIERS_REQUIRED" PKG_CHECK_MODULES([XCB_DRI3_MODIFIERS], [$dri3_modifier_modules], [have_dri3_modifiers=yes], [have_dri3_modifiers=no]) - if test "x$have_dri3_modifiers" == xyes; then + if test "x$have_dri3_modifiers" = xyes; then DEFINES="$DEFINES -DHAVE_DRI3_MODIFIERS" fi fi @@ -2357,7 +2357,7 @@ if test "x$enable_xvmc" = xyes -o \ "x$enable_omx_tizonia" = xyes -o \ "x$enable_va" = xyes; then if echo $platforms | grep -q "x11"; then - PKG_CHECK_MODULES([VL], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED]) + PKG_CHECK_MODULES([VL], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED libdrm >= $LIBDRM_REQUIRED]) fi need_gallium_vl_winsys=yes fi @@ -2845,8 +2845,8 @@ if test -n "$with_gallium_drivers"; then fi # XXX: Keep in sync with LLVM_REQUIRED_SWR -AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test 
"x$LLVM_VERSION" != x6.0.0 -a \ - "x$LLVM_VERSION" != x6.0.1) +AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x7.0.0 -a \ + "x$LLVM_VERSION" != x7.0.1) if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium" @@ -2949,7 +2949,7 @@ if test "x$enable_llvm" = xyes; then dnl the LLVM library propagated in the Libs.private of the respective .pc dnl file which ensures complete dependency information when statically dnl linking. - if test "x$enable_glx" == xgallium-xlib; then + if test "x$enable_glx" = xgallium-xlib; then GL_PC_LIB_PRIV="$GL_PC_LIB_PRIV $LLVM_LIBS" fi if test "x$enable_gallium_osmesa" = xyes; then diff --git a/docs/envvars.html b/docs/envvars.html index c9733e65234..43d3a6cf169 100644 --- a/docs/envvars.html +++ b/docs/envvars.html @@ -338,6 +338,9 @@

VMware SVGA driver environment variables

for details.
  • SVGA_EXTRA_LOGGING - if set, enables extra logging to the vmware.log file, such as the OpenGL program's name and command line arguments. +
  • SVGA_NO_LOGGING - if set, disables logging to the vmware.log file. +This is useful when using Valgrind because it otherwise crashes when +initializing the host log feature.
  • See the driver code for other, lesser-used variables. diff --git a/docs/relnotes/19.0.0.html b/docs/relnotes/19.0.0.html index 1b4edd7ce76..ea22d660f37 100644 --- a/docs/relnotes/19.0.0.html +++ b/docs/relnotes/19.0.0.html @@ -32,7 +32,8 @@

    Mesa 19.0.0 Release Notes / TBD

    SHA256 checksums

    -TBD.
    +  4c5b9c5227d37c1f6bdc786a6fa7ee7fbce40b2e8a87340c7d3234534ece3304  mesa-19.0.0.tar.gz
    +  5a549dfb40ec31e5c36c47aadac04554cb2e2a8d144a046a378fc16da57e38f8  mesa-19.0.0.tar.xz
     
    @@ -60,13 +61,2413 @@

    New features

    Bug fixes

    Changes

    diff --git a/docs/relnotes/19.0.1.html b/docs/relnotes/19.0.1.html new file mode 100644 index 00000000000..d5f82f9b022 --- /dev/null +++ b/docs/relnotes/19.0.1.html @@ -0,0 +1,159 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.1 Release Notes / March 27, 2019

    + +

    +Mesa 19.0.1 is a bug fix release which fixes bugs found since the 19.0.0 release. +

    +

    +Mesa 19.0.1 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +f1dd1980ed628edea3935eed7974fbc5d8353e9578c562728b880d63ac613dbd  mesa-19.0.1.tar.gz
    +6884163c0ea9e4c98378ab8fecd72fe7b5f437713a14471beda378df247999d4  mesa-19.0.1.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + + + +

    Changes

    + +

    Andres Gomez (4):

    + + +

    Bas Nieuwenhuizen (2):

    + + +

    Danylo Piliaiev (2):

    + + +

    Dave Airlie (1):

    + + +

    Dylan Baker (5):

    + + +

    Eric Anholt (1):

    + + +

    Jason Ekstrand (6):

    + + +

    Józef Kucia (2):

    + + +

    Kenneth Graunke (1):

    + + +

    Kevin Strasser (1):

    + + +

    Mark Janes (1):

    + + +

    Plamena Manolova (1):

    + + +

    Samuel Pitoiset (3):

    + + +

    Sergii Romantsov (1):

    + + +

    Tapani Pälli (2):

    + + + +
    + + diff --git a/docs/relnotes/19.0.2.html b/docs/relnotes/19.0.2.html new file mode 100644 index 00000000000..e760bd4ac21 --- /dev/null +++ b/docs/relnotes/19.0.2.html @@ -0,0 +1,122 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.2 Release Notes / April 10, 2019

    + +

    +Mesa 19.0.2 is a bug fix release which fixes bugs found since the 19.0.1 release. +

    +

    +Mesa 19.0.2 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +eb972fc11d4e1261d34ec0b91a701f158d4870c0428fb108353ae7eab64b1118  mesa-19.0.2.tar.gz
    +1a2edc3ce56906a676c91e6851298db45903df1f5cb9827395a922c1452db802  mesa-19.0.2.tar.xz
    +
    + + +

    New features

    + + +

    Bug fixes

    + + + + +

    Changes

    + + +

    Boyuan Zhang (1):

    + + +

    Caio Marcelo de Oliveira Filho (1):

    + + +

    Dylan Baker (2):

    + + +

    Eric Anholt (3):

    + + +

    Eric Engestrom (1):

    + + +

    Jason Ekstrand (1):

    + + +

    Karol Herbst (1):

    + + +

    Leo Liu (2):

    + + +

    Lionel Landwerlin (1):

    + + +

    Marek Olšák (1):

    + + +

    Samuel Pitoiset (2):

    + + +
    + + diff --git a/docs/relnotes/19.0.3.html b/docs/relnotes/19.0.3.html new file mode 100644 index 00000000000..d0fe3deb1ba --- /dev/null +++ b/docs/relnotes/19.0.3.html @@ -0,0 +1,148 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.3 Release Notes / April 24, 2019

    + +

    +Mesa 19.0.3 is a bug fix release which fixes bugs found since the 19.0.2 release. +

    +

    +Mesa 19.0.3 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +59543ec3c9f8c72990e77887f13d1678cb6739e5d5f56abc21ebf9e772389c5e  mesa-19.0.3.tar.gz
    +f027244e38dc309a4c12db45ef79be81ab62c797a50a88d566e4edb6159fc4d5  mesa-19.0.3.tar.xz
    +
    + + +

    New features

    + +

    N/A

    + +

    Bug fixes

    + + + +

    Changes

    + +

    Andres Gomez (1):

    + + +

    Bas Nieuwenhuizen (1):

    + + +

    Chia-I Wu (1):

    + + +

    Danylo Piliaiev (1):

    + + +

    Dylan Baker (2):

    + + +

    Eric Anholt (1):

    + + +

    Eric Engestrom (1):

    + + +

    Jason Ekstrand (2):

    + + +

    Juan A. Suarez Romero (1):

    + + +

    Kenneth Graunke (2):

    + + +

    Lionel Landwerlin (2):

    + + +

    Lubomir Rintel (2):

    + + +

    Marek Olšák (1):

    + + +

    Rhys Perry (1):

    + + +

    Roland Scheidegger (1):

    + + +

    Samuel Pitoiset (2):

    + + + +
    + + diff --git a/docs/relnotes/19.0.4.html b/docs/relnotes/19.0.4.html new file mode 100644 index 00000000000..7c1d493f9ee --- /dev/null +++ b/docs/relnotes/19.0.4.html @@ -0,0 +1,243 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.4 Release Notes / May 9, 2019

    + +

    +Mesa 19.0.4 is a bug fix release which fixes bugs found since the 19.0.3 release. +

    +

    +Mesa 19.0.4 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +de361c76bf7aae09219f571b9ae77a34864a1cd9f6ba24c845b18b3cd5e4b9a2  mesa-19.0.4.tar.gz
    +39f9f32f448d77388ef817c6098d50eb0c1595815ce7e895dec09dd68774ce47  mesa-19.0.4.tar.xz
    +
    + + +

    New features

    + +

    N/A

    + +

    Bug fixes

    + + + +

    Changes

    + +

    Alejandro Piñeiro (1):

    + + +

    Andrii Simiklit (1):

    + + +

    Axel Davy (1):

    + + +

    Bas Nieuwenhuizen (1):

    + + +

    Brian Paul (1):

    + + +

    Caio Marcelo de Oliveira Filho (1):

    + + +

    Charmaine Lee (1):

    + + +

    Chuck Atkins (1):

    + + +

    Daniel Stone (1):

    + + +

    Dave Airlie (2):

    + + +

    Dylan Baker (7):

    + + +

    Emil Velikov (3):

    + + +

    Erik Faye-Lund (2):

    + + +

    Francisco Jerez (2):

    + + +

    Hal Gentz (1):

    + + +

    Ian Romanick (2):

    + + +

    Jason Ekstrand (1):

    + + +

    Jon Turney (1):

    + + +

    Juan A. Suarez Romero (2):

    + + +

    Kenneth Graunke (6):

    + + +

    Lionel Landwerlin (4):

    + + +

    Marek Olšák (2):

    + + +

    Nicolai Hähnle (1):

    + + +

    Rhys Perry (1):

    + + +

    Ross Burton (1):

    + + +

    Samuel Pitoiset (8):

    + + +

    Tapani Pälli (1):

    + + +

    Timothy Arceri (4):

    + + +
    + + diff --git a/docs/relnotes/19.0.5.html b/docs/relnotes/19.0.5.html new file mode 100644 index 00000000000..bf0cd43eaaa --- /dev/null +++ b/docs/relnotes/19.0.5.html @@ -0,0 +1,137 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.5 Release Notes / May 21, 2019

    + +

    +Mesa 19.0.5 is a bug fix release which fixes bugs found since the 19.0.4 release. +

    +

    +Mesa 19.0.5 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +b6e6b78c23bec15d1e7887c78b7ad00ce395ea1b20ad8aab6ce441f55f724e70  mesa-19.0.5.tar.gz
    +6aecb7f67c136768692fb3c33a54196186c6c4fcafab7973516a355e1a54f831  mesa-19.0.5.tar.xz
    +
    + + +

    New features

    + +

    N/A

    + +

    Bug fixes

    + + + +

    Changes

    + +

    Caio Marcelo de Oliveira Filho (2):

    + + +

    Charmaine Lee (2):

    + + +

    Dylan Baker (4):

    + + +

    Eric Engestrom (1):

    + + +

    Gert Wollny (2):

    + + +

    Ian Romanick (1):

    + + +

    Jason Ekstrand (3):

    + + +

    Józef Kucia (1):

    + + +

    Kenneth Graunke (1):

    + + +

    Leo Liu (1):

    + + +

    Lionel Landwerlin (1):

    + + +

    Marek Olšák (1):

    + + +

    Nanley Chery (1):

    + + +

    Samuel Pitoiset (1):

    + + +
    + + diff --git a/docs/shading.html b/docs/shading.html index 9e3c7218e31..76f25316f86 100644 --- a/docs/shading.html +++ b/docs/shading.html @@ -59,6 +59,7 @@

    Environment Variables

  • nopfrag - force fragment shader to be a simple shader that passes through the color attribute.
  • useprog - log glUseProgram calls to stderr +
  • errors - GLSL compilation and link errors will be reported to stderr.

    Example: export MESA_GLSL=dump,nopt diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h index f2e46f65f92..6d134e3a40f 100644 --- a/include/GL/internal/dri_interface.h +++ b/include/GL/internal/dri_interface.h @@ -1352,6 +1352,10 @@ struct __DRIdri2ExtensionRec { #define __DRI_IMAGE_FOURCC_YVU422 0x36315659 #define __DRI_IMAGE_FOURCC_YVU444 0x34325659 +#define __DRI_IMAGE_FOURCC_P010 0x30313050 +#define __DRI_IMAGE_FOURCC_P012 0x32313050 +#define __DRI_IMAGE_FOURCC_P016 0x36313050 + /** * Queryable on images created by createImageFromNames. * diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h index 7201562d824..b91abd7a3f9 100644 --- a/include/pci_ids/i965_pci_ids.h +++ b/include/pci_ids/i965_pci_ids.h @@ -171,6 +171,7 @@ CHIPSET(0x3185, glk_2x6, "Intel(R) UHD Graphics 600 (Geminilake 2x6)") CHIPSET(0x3E90, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)") CHIPSET(0x3E93, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)") CHIPSET(0x3E99, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)") +CHIPSET(0x3E9C, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)") CHIPSET(0x3E91, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") CHIPSET(0x3E92, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)") @@ -203,6 +204,10 @@ CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") CHIPSET(0x8A50, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") CHIPSET(0x8A51, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") CHIPSET(0x8A52, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") +CHIPSET(0x8A56, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") +CHIPSET(0x8A57, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") +CHIPSET(0x8A58, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") +CHIPSET(0x8A59, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") CHIPSET(0x8A5A, icl_6x8, "Intel(R) HD 
Graphics (Ice Lake 6x8 GT1.5)") CHIPSET(0x8A5B, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") CHIPSET(0x8A5C, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") diff --git a/meson.build b/meson.build index d975b0dbf4b..5286b91c615 100644 --- a/meson.build +++ b/meson.build @@ -61,11 +61,11 @@ endif dri_drivers_path = get_option('dri-drivers-path') if dri_drivers_path == '' - dri_drivers_path = join_paths(get_option('libdir'), 'dri') + dri_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'dri') endif dri_search_path = get_option('dri-search-path') if dri_search_path == '' - dri_search_path = join_paths(get_option('prefix'), dri_drivers_path) + dri_search_path = dri_drivers_path endif with_gles1 = get_option('gles1') @@ -608,7 +608,7 @@ with_gallium_xa = _xa != 'false' d3d_drivers_path = get_option('d3d-drivers-path') if d3d_drivers_path == '' - d3d_drivers_path = join_paths(get_option('libdir'), 'd3d') + d3d_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'd3d') endif with_gallium_st_nine = get_option('gallium-nine') @@ -1213,6 +1213,7 @@ if _llvm != 'false' with_gallium_opencl or _llvm == 'true' ), static : not _shared_llvm, + method : 'config-tool', ) with_llvm = dep_llvm.found() endif @@ -1387,12 +1388,14 @@ if with_platform_x11 dep_xshmfence = dependency('xshmfence', version : '>= 1.1') endif endif - if with_glx == 'dri' + if with_glx == 'dri' or with_glx == 'gallium-xlib' + dep_glproto = dependency('glproto', version : '>= 1.4.14') + endif + if with_glx == 'dri' if with_dri_platform == 'drm' dep_dri2proto = dependency('dri2proto', version : '>= 2.8') dep_xxf86vm = dependency('xxf86vm') endif - dep_glproto = dependency('glproto', version : '>= 1.4.14') endif if (with_egl or ( with_gallium_vdpau or with_gallium_xvmc or with_gallium_xa or @@ -1400,7 +1403,7 @@ if with_platform_x11 dep_xcb_xfixes = dependency('xcb-xfixes') endif if with_xlib_lease - dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12') + 
dep_xcb_xrandr = dependency('xcb-randr') dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3') endif endif diff --git a/scons/custom.py b/scons/custom.py index 09946fa7324..8028990ef61 100644 --- a/scons/custom.py +++ b/scons/custom.py @@ -48,7 +48,12 @@ # a path directly. We want to support both, so we need to detect the SCons version, # for which no API is provided by SCons 8-P -scons_version = tuple(map(int, SCons.__version__.split('.'))) +# Scons version string has consistently been in this format: +# MajorVersion.MinorVersion.Patch[.alpha/beta.yyyymmdd] +# so this formula should cover all versions regardless of type +# stable, alpha or beta. +# For simplicity alpha and beta flags are removed. +scons_version = tuple(map(int, SCons.__version__.split('.')[:3])) def quietCommandLines(env): # Quiet command lines diff --git a/scons/gallium.py b/scons/gallium.py index 963834a5fbc..61bbeb2399f 100755 --- a/scons/gallium.py +++ b/scons/gallium.py @@ -308,7 +308,20 @@ def generate(env): if env.GetOption('num_jobs') <= 1: env.SetOption('num_jobs', num_jobs()) - env.Decider('MD5-timestamp') + # Speed up dependency checking. See + # - https://github.com/SCons/scons/wiki/GoFastButton + # - https://bugs.freedesktop.org/show_bug.cgi?id=109443 + + # Scons version string has consistently been in this format: + # MajorVersion.MinorVersion.Patch[.alpha/beta.yyyymmdd] + # so this formula should cover all versions regardless of type + # stable, alpha or beta. + # For simplicity alpha and beta flags are removed. 
+ + scons_version = distutils.version.StrictVersion('.'.join(SCons.__version__.split('.')[:3])) + if scons_version < distutils.version.StrictVersion('3.0.2') or \ + scons_version > distutils.version.StrictVersion('3.0.4'): + env.Decider('MD5-timestamp') env.SetOption('max_drift', 60) # C preprocessor options diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index fc8c6a09d2f..7ba13c24953 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -367,9 +367,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev, info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20; info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21; info->has_ctx_priority = info->drm_minor >= 22; - /* TODO: Enable this once the kernel handles it efficiently. */ - info->has_local_buffers = info->drm_minor >= 20 && - !info->has_dedicated_vram; + info->has_local_buffers = info->drm_minor >= 20; info->kernel_flushes_hdp_before_ib = true; info->htile_cmask_support_1d_tiling = true; info->si_TA_CS_BC_BASE_ADDR_allowed = true; diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 768364b2dc6..3d7eb7b0421 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -905,6 +905,37 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); } +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[6]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = ctx->i1false; + args[4] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = ctx->i1false; + args[5] = params; + + return 
ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); +} + LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, @@ -923,6 +954,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx, ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); } +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); +} + LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index e47893bbbe6..370e7e9741c 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -216,6 +216,14 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef i, LLVMValueRef j); +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, @@ -223,6 +231,11 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params); +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); + LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c index 69446863b95..6063411310b 100644 --- a/src/amd/common/ac_llvm_util.c +++ b/src/amd/common/ac_llvm_util.c @@ -151,13 +151,14 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, LLVMTargetRef target = ac_get_llvm_target(triple); snprintf(features, sizeof(features), - "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s", + "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s", HAVE_LLVM >= 0x0800 ? 
"" : ",+vgpr-spilling", tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "", tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", - tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : ""); - + tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "", + tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : ""); + LLVMTargetMachineRef tm = LLVMCreateTargetMachine( target, triple, diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h index 6d961c06f8a..ca00540da80 100644 --- a/src/amd/common/ac_llvm_util.h +++ b/src/amd/common/ac_llvm_util.h @@ -65,6 +65,7 @@ enum ac_target_machine_options { AC_TM_CHECK_IR = (1 << 5), AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6), AC_TM_CREATE_LOW_OPT = (1 << 7), + AC_TM_NO_LOAD_STORE_OPT = (1 << 8), }; enum ac_float_mode { diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index efd3e260af1..a0815995b12 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1019,10 +1019,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) LLVMValueRef in[3]; for (unsigned chan = 0; chan < 3; chan++) in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", + results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", + results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); + results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); + LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); + results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); + 
results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); result = ac_build_gather_values(&ctx->ac, results, 2); break; } @@ -1896,14 +1903,18 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, if (var) { bool vs_in = ctx->stage == MESA_SHADER_VERTEX && var->data.mode == nir_var_shader_in; - if (var->data.compact) - stride = 1; idx = var->data.driver_location; comp = var->data.location_frac; mode = var->data.mode; get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), vs_in, NULL, NULL, &const_index, &indir_index); + + if (var->data.compact) { + stride = 1; + const_index += comp; + comp = 0; + } } if (instr->dest.ssa.bit_size == 64) @@ -2006,18 +2017,28 @@ static void visit_store_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); LLVMValueRef temp_ptr, value; - int idx = var->data.driver_location; - unsigned comp = var->data.location_frac; + int idx = 0; + unsigned comp = 0; LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); int writemask = instr->const_index[0]; LLVMValueRef indir_index; unsigned const_index; - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false, - NULL, NULL, &const_index, &indir_index); + if (var) { + get_deref_offset(ctx, deref, false, + NULL, NULL, &const_index, &indir_index); + idx = var->data.driver_location; + comp = var->data.location_frac; + + if (var->data.compact) { + const_index += comp; + comp = 0; + } + } if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) { @@ -2030,7 +2051,7 @@ visit_store_var(struct ac_nir_context *ctx, writemask = writemask << comp; - switch (var->data.mode) { + switch (deref->mode) { case nir_var_shader_out: if (ctx->stage == MESA_SHADER_TESS_CTRL) { @@ -2039,8 
+2060,8 @@ visit_store_var(struct ac_nir_context *ctx, unsigned const_index = 0; const bool is_patch = var->data.patch; - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - false, NULL, is_patch ? NULL : &vertex_index, + get_deref_offset(ctx, deref, false, NULL, + is_patch ? NULL : &vertex_index, &const_index, &indir_index); ctx->abi->store_tcs_outputs(ctx->abi, var, @@ -2107,7 +2128,7 @@ visit_store_var(struct ac_nir_context *ctx, int writemask = instr->const_index[0]; LLVMValueRef address = get_src(ctx, instr->src[0]); LLVMValueRef val = get_src(ctx, instr->src[1]); - if (util_is_power_of_two_nonzero(writemask)) { + if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) { val = LLVMBuildBitCast( ctx->ac.builder, val, LLVMGetElementType(LLVMTypeOf(address)), ""); @@ -2338,10 +2359,12 @@ static void get_image_coords(struct ac_nir_context *ctx, } static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, bool write) + const nir_intrinsic_instr *instr, + bool write, bool atomic) { LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write); - if (ctx->abi->gfx9_stride_size_workaround) { + if (ctx->abi->gfx9_stride_size_workaround || + (ctx->abi->gfx9_stride_size_workaround_for_atomic && atomic)) { LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); @@ -2374,7 +2397,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, unsigned num_channels = util_last_bit(mask); LLVMValueRef rsrc, vindex; - rsrc = get_image_buffer_descriptor(ctx, instr, false); + rsrc = get_image_buffer_descriptor(ctx, instr, false, false); vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, 
""); @@ -2418,7 +2441,7 @@ static void visit_image_store(struct ac_nir_context *ctx, if (dim == GLSL_SAMPLER_DIM_BUF) { char name[48]; const char *types[] = { "f32", "v2f32", "v4f32" }; - LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true); + LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false); LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); unsigned src_channels = ac_get_llvm_num_components(src); @@ -2514,11 +2537,14 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, params[param_count++] = get_src(ctx, instr->src[3]); if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { - params[param_count++] = get_image_buffer_descriptor(ctx, instr, true); + params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true); params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); /* vindex */ params[param_count++] = ctx->ac.i32_0; /* voffset */ - if (HAVE_LLVM >= 0x800) { + if (HAVE_LLVM >= 0x900) { + /* XXX: The new raw/struct atomic intrinsics are buggy + * with LLVM 8, see r358579. 
+ */ params[param_count++] = ctx->ac.i32_0; /* soffset */ params[param_count++] = ctx->ac.i32_0; /* slc */ @@ -3079,7 +3105,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, ctx->abi->frag_pos[2], ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3]) }; - result = ac_build_gather_values(&ctx->ac, values, 4); + result = ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); break; } case nir_intrinsic_load_front_face: @@ -3818,6 +3845,73 @@ static void visit_jump(struct ac_llvm_context *ctx, } } +static LLVMTypeRef +glsl_base_to_llvm_type(struct ac_llvm_context *ac, + enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return ac->i32; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; + case GLSL_TYPE_FLOAT: + return ac->f32; + case GLSL_TYPE_FLOAT16: + return ac->f16; + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + return ac->i64; + case GLSL_TYPE_DOUBLE: + return ac->f64; + default: + unreachable("unknown GLSL type"); + } +} + +static LLVMTypeRef +glsl_to_llvm_type(struct ac_llvm_context *ac, + const struct glsl_type *type) +{ + if (glsl_type_is_scalar(type)) { + return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); + } + + if (glsl_type_is_vector(type)) { + return LLVMVectorType( + glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + } + + if (glsl_type_is_matrix(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_column_type(type)), + glsl_get_matrix_columns(type)); + } + + if (glsl_type_is_array(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_array_element(type)), + glsl_get_length(type)); + } + + assert(glsl_type_is_struct(type)); + + LLVMTypeRef member_types[glsl_get_length(type)]; + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + member_types[i] = + glsl_to_llvm_type(ac, + glsl_get_struct_field(type, i)); + } + + return 
LLVMStructTypeInContext(ac->context, member_types, + glsl_get_length(type), false); +} + static void visit_deref(struct ac_nir_context *ctx, nir_deref_instr *instr) { @@ -3839,9 +3933,27 @@ static void visit_deref(struct ac_nir_context *ctx, result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index)); break; - case nir_deref_type_cast: + case nir_deref_type_ptr_as_array: + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + break; + case nir_deref_type_cast: { result = get_src(ctx, instr->parent); + + LLVMTypeRef pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); + LLVMTypeRef type = LLVMPointerType(pointee_type, AC_ADDR_SPACE_LDS); + + if (LLVMTypeOf(result) != type) { + if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { + result = LLVMBuildBitCast(ctx->ac.builder, result, + type, ""); + } else { + result = LLVMBuildIntToPtr(ctx->ac.builder, result, + type, ""); + } + } break; + } default: unreachable("Unhandled deref_instr deref type"); } @@ -3990,73 +4102,6 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, } } -static LLVMTypeRef -glsl_base_to_llvm_type(struct ac_llvm_context *ac, - enum glsl_base_type type) -{ - switch (type) { - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_BOOL: - case GLSL_TYPE_SUBROUTINE: - return ac->i32; - case GLSL_TYPE_INT16: - case GLSL_TYPE_UINT16: - return ac->i16; - case GLSL_TYPE_FLOAT: - return ac->f32; - case GLSL_TYPE_FLOAT16: - return ac->f16; - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - return ac->i64; - case GLSL_TYPE_DOUBLE: - return ac->f64; - default: - unreachable("unknown GLSL type"); - } -} - -static LLVMTypeRef -glsl_to_llvm_type(struct ac_llvm_context *ac, - const struct glsl_type *type) -{ - if (glsl_type_is_scalar(type)) { - return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); - } - - if (glsl_type_is_vector(type)) { - return LLVMVectorType( - glsl_base_to_llvm_type(ac, 
glsl_get_base_type(type)), - glsl_get_vector_elements(type)); - } - - if (glsl_type_is_matrix(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_column_type(type)), - glsl_get_matrix_columns(type)); - } - - if (glsl_type_is_array(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_array_element(type)), - glsl_get_length(type)); - } - - assert(glsl_type_is_struct(type)); - - LLVMTypeRef member_types[glsl_get_length(type)]; - - for (unsigned i = 0; i < glsl_get_length(type); i++) { - member_types[i] = - glsl_to_llvm_type(ac, - glsl_get_struct_field(type, i)); - } - - return LLVMStructTypeInContext(ac->context, member_types, - glsl_get_length(type), false); -} - static void setup_locals(struct ac_nir_context *ctx, struct nir_function *func) diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index ee18e6c1923..9eb4d37257e 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -195,6 +195,7 @@ struct ac_shader_abi { /* Whether to workaround GFX9 ignoring the stride for the buffer size if IDXEN=0 * and LLVM optimizes an indexed load with constant index to IDXEN=0. 
*/ bool gfx9_stride_size_workaround; + bool gfx9_stride_size_workaround_for_atomic; }; #endif /* AC_SHADER_ABI_H */ diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 7f7f052986e..1271c3e73f2 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -301,7 +301,6 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer) static VkResult radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) { - cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, @@ -326,6 +325,8 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->record_result = VK_SUCCESS; + memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings)); + for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) { cmd_buffer->descriptors[i].dirty = 0; cmd_buffer->descriptors[i].valid = 0; @@ -338,14 +339,15 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) unsigned fence_offset, eop_bug_offset; void *fence_ptr; - radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0, &fence_offset, + radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset, &fence_ptr); + cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); cmd_buffer->gfx9_fence_va += fence_offset; /* Allocate a buffer for the EOP bug on GFX9. 
*/ - radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 0, + radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8, &eop_bug_offset, &fence_ptr); cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); @@ -416,6 +418,8 @@ radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned *out_offset, void **ptr) { + assert(util_is_power_of_two_nonzero(alignment)); + uint64_t offset = align(cmd_buffer->upload.offset, alignment); if (offset + size > cmd_buffer->upload.size) { if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size)) @@ -1255,7 +1259,7 @@ radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) ++reg_count; - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); @@ -1279,7 +1283,7 @@ radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->tc_compat_zrange_offset; - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); @@ -1356,7 +1360,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; - if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) { + if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); @@ -1473,7 +1477,7 @@ radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, assert(radv_image_has_cmask(image) || radv_image_has_dcc(image)); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 0)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 
cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); @@ -1518,14 +1522,13 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; - if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) { + if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); radeon_emit(cs, 2); } else { - /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | @@ -2155,6 +2158,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect, + !!draw_info->strmout_buffer, draw_info->indirect ? 0 : draw_info->count); if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { @@ -4404,10 +4408,15 @@ static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffe if (!radv_image_has_htile(image)) return; - if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED && - radv_layout_has_htile(image, dst_layout, dst_queue_mask)) { - /* TODO: merge with the clear if applicable */ - radv_initialize_htile(cmd_buffer, image, range, 0); + if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 
0xfffff30f : 0xfffc000f; + + if (radv_layout_is_htile_compressed(image, dst_layout, + dst_queue_mask)) { + clear_value = 0; + } + + radv_initialize_htile(cmd_buffer, image, range, clear_value); } else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) && radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) { uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f; @@ -4837,8 +4846,11 @@ void radv_CmdBeginConditionalRenderingEXT( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer); + struct radeon_cmdbuf *cs = cmd_buffer->cs; bool draw_visible = true; - uint64_t va; + uint64_t pred_value = 0; + uint64_t va, new_va; + unsigned pred_offset; va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset; @@ -4854,13 +4866,51 @@ void radv_CmdBeginConditionalRenderingEXT( si_emit_cache_flush(cmd_buffer); + /* From the Vulkan spec 1.1.107: + * + * "If the 32-bit value at offset in buffer memory is zero, then the + * rendering commands are discarded, otherwise they are executed as + * normal. If the value of the predicate in buffer memory changes while + * conditional rendering is active, the rendering commands may be + * discarded in an implementation-dependent way. Some implementations + * may latch the value of the predicate upon beginning conditional + * rendering while others may read it before every rendering command." + * + * But, the AMD hardware treats the predicate as a 64-bit value which + * means we need a workaround in the driver. Luckily, it's not required + * to support if the value changes when predication is active. 
+ * + * The workaround is as follows: + * 1) allocate a 64-value in the upload BO and initialize it to 0 + * 2) copy the 32-bit predicate value to the upload BO + * 3) use the new allocated VA address for predication + * + * Based on the conditionalrender demo, it's faster to do the COPY_DATA + * in ME (+ sync PFP) instead of PFP. + */ + radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset); + + new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset; + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, new_va); + radeon_emit(cs, new_va >> 32); + + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + /* Enable predication for this command buffer. */ - si_emit_set_predication_state(cmd_buffer, draw_visible, va); + si_emit_set_predication_state(cmd_buffer, draw_visible, new_va); cmd_buffer->state.predicating = true; /* Store conditional rendering user info. 
*/ cmd_buffer->state.predication_type = draw_visible; - cmd_buffer->state.predication_va = va; + cmd_buffer->state.predication_va = new_va; } void radv_CmdEndConditionalRenderingEXT( @@ -4904,7 +4954,7 @@ void radv_CmdBindTransformFeedbackBuffersEXT( enabled_mask |= 1 << idx; } - cmd_buffer->state.streamout.enabled_mask = enabled_mask; + cmd_buffer->state.streamout.enabled_mask |= enabled_mask; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; } diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 01712bd22ce..ac93434b8bd 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -51,6 +51,7 @@ enum { RADV_DEBUG_CHECKIR = 0x200000, RADV_DEBUG_NOTHREADLLVM = 0x400000, RADV_DEBUG_NOBINNING = 0x800000, + RADV_DEBUG_NO_LOAD_STORE_OPT = 0x1000000, }; enum { diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c index cebe06aa078..68171b5d244 100644 --- a/src/amd/vulkan/radv_descriptor_set.c +++ b/src/amd/vulkan/radv_descriptor_set.c @@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout( uint32_t immutable_sampler_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding); - if (pCreateInfo->pBindings[j].pImmutableSamplers) + if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + pCreateInfo->pBindings[j].pImmutableSamplers) immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; } @@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout( set_layout->has_variable_descriptors = true; } - if (binding->pImmutableSamplers) { + if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + binding->pImmutableSamplers) { set_layout->binding[b].immutable_samplers_offset = samplers_offset; 
set_layout->binding[b].immutable_samplers_equal = has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount); @@ -965,9 +969,11 @@ void radv_update_descriptor_sets( } src_ptr += src_binding_layout->size / 4; dst_ptr += dst_binding_layout->size / 4; - dst_buffer_list[j] = src_buffer_list[j]; - ++src_buffer_list; - ++dst_buffer_list; + + if (src_binding_layout->type != VK_DESCRIPTOR_TYPE_SAMPLER) { + /* Sampler descriptors don't have a buffer list. */ + dst_buffer_list[j] = src_buffer_list[j]; + } } } } diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 34d93b262f8..334c8bd4548 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -111,6 +111,7 @@ radv_get_device_name(enum radeon_family family, char *name, size_t name_len) case CHIP_VEGAM: chip_string = "AMD RADV VEGA M"; break; case CHIP_VEGA10: chip_string = "AMD RADV VEGA10"; break; case CHIP_VEGA12: chip_string = "AMD RADV VEGA12"; break; + case CHIP_VEGA20: chip_string = "AMD RADV VEGA20"; break; case CHIP_RAVEN: chip_string = "AMD RADV RAVEN"; break; case CHIP_RAVEN2: chip_string = "AMD RADV RAVEN2"; break; default: chip_string = "AMD RADV unknown"; break; @@ -337,7 +338,7 @@ radv_physical_device_init(struct radv_physical_device *device, device->rad_info.chip_class > GFX9) fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); - radv_get_driver_uuid(&device->device_uuid); + radv_get_driver_uuid(&device->driver_uuid); radv_get_device_uuid(&device->rad_info, &device->device_uuid); if (device->rad_info.family == CHIP_STONEY || @@ -369,6 +370,11 @@ radv_physical_device_init(struct radv_physical_device *device, device->dcc_msaa_allowed = (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA); + /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. 
*/ + device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 || + (device->rad_info.chip_class >= VI && + device->rad_info.me_fw_feature >= 41); + radv_physical_device_init_mem_types(device); radv_fill_device_extension_table(device, &device->supported_extensions); @@ -460,6 +466,7 @@ static const struct debug_control radv_debug_options[] = { {"checkir", RADV_DEBUG_CHECKIR}, {"nothreadllvm", RADV_DEBUG_NOTHREADLLVM}, {"nobinning", RADV_DEBUG_NOBINNING}, + {"noloadstoreopt", RADV_DEBUG_NO_LOAD_STORE_OPT}, {NULL, 0} }; @@ -505,6 +512,13 @@ radv_handle_per_app_options(struct radv_instance *instance, } else if (!strcmp(name, "DOOM_VFR")) { /* Work around a Doom VFR game bug */ instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS; + } else if (!strcmp(name, "MonsterHunterWorld.exe")) { + /* Workaround for a WaW hazard when LLVM moves/merges + * load/store memory operations. + * See https://reviews.llvm.org/D61313 + */ + if (HAVE_LLVM < 0x900) + instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT; } } @@ -734,8 +748,7 @@ void radv_GetPhysicalDeviceFeatures( .alphaToOne = true, .multiViewport = true, .samplerAnisotropy = true, - .textureCompressionETC2 = pdevice->rad_info.chip_class >= GFX9 || - pdevice->rad_info.family == CHIP_STONEY, + .textureCompressionETC2 = radv_device_supports_etc(pdevice), .textureCompressionASTC_LDR = false, .textureCompressionBC = true, .occlusionQueryPrecise = true, @@ -802,7 +815,7 @@ void radv_GetPhysicalDeviceFeatures2( features->storageBuffer16BitAccess = enabled; features->uniformAndStorageBuffer16BitAccess = enabled; features->storagePushConstant16 = enabled; - features->storageInputOutput16 = enabled; + features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { @@ -998,7 +1011,7 @@ void radv_GetPhysicalDeviceProperties( .maxCullDistances = 8, .maxCombinedClipAndCullDistances = 8, .discreteQueuePriorities = 2, - .pointSizeRange = { 
0.125, 255.875 }, + .pointSizeRange = { 0.0, 8192.0 }, .lineWidthRange = { 0.0, 7.9921875 }, .pointSizeGranularity = (1.0 / 8.0), .lineWidthGranularity = (1.0 / 128.0), @@ -2790,7 +2803,7 @@ VkResult radv_QueueSubmit( struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; struct radeon_winsys_ctx *ctx = queue->hw_ctx; int ret; - uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; + uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT; uint32_t scratch_size = 0; uint32_t compute_scratch_size = 0; uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 1bf56943f25..187c0ba574d 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -100,7 +100,7 @@ def __init__(self, name, ext_version, enable): Extension('VK_EXT_display_control', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'), Extension('VK_EXT_debug_report', 9, True), Extension('VK_EXT_depth_range_unrestricted', 1, True), - Extension('VK_EXT_descriptor_indexing', 2, True), + Extension('VK_EXT_descriptor_indexing', 2, False), Extension('VK_EXT_discard_rectangles', 1, True), Extension('VK_EXT_external_memory_dma_buf', 1, True), Extension('VK_EXT_external_memory_host', 1, 'device->rad_info.has_userptr'), diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c index 499d94befeb..9c61e769ebd 100644 --- a/src/amd/vulkan/radv_formats.c +++ b/src/amd/vulkan/radv_formats.c @@ -595,6 +595,14 @@ static bool radv_is_filter_minmax_format_supported(VkFormat format) } } +bool +radv_device_supports_etc(struct radv_physical_device *physical_device) +{ + return physical_device->rad_info.family == CHIP_VEGA10 || + physical_device->rad_info.family == CHIP_RAVEN || + physical_device->rad_info.family == CHIP_STONEY; +} + static void radv_physical_device_get_format_properties(struct radv_physical_device *physical_device, VkFormat format, @@ -612,9 +620,7 
@@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } if (desc->layout == VK_FORMAT_LAYOUT_ETC && - physical_device->rad_info.family != CHIP_VEGA10 && - physical_device->rad_info.family != CHIP_RAVEN && - physical_device->rad_info.family != CHIP_STONEY) { + !radv_device_supports_etc(physical_device)) { out_properties->linearTilingFeatures = linear; out_properties->optimalTilingFeatures = tiled; out_properties->bufferFeatures = buffer; @@ -984,10 +990,22 @@ bool radv_format_pack_clear_color(VkFormat format, assert(channel->size == 8); v = util_format_linear_float_to_srgb_8unorm(value->float32[c]); - } else if (channel->type == VK_FORMAT_TYPE_UNSIGNED) { - v = MAX2(MIN2(value->float32[c], 1.0f), 0.0f) * ((1ULL << channel->size) - 1); - } else { - v = MAX2(MIN2(value->float32[c], 1.0f), -1.0f) * ((1ULL << (channel->size - 1)) - 1); + } else { + float f = MIN2(value->float32[c], 1.0f); + + if (channel->type == VK_FORMAT_TYPE_UNSIGNED) { + f = MAX2(f, 0.0f) * ((1ULL << channel->size) - 1); + } else { + f = MAX2(f, -1.0f) * ((1ULL << (channel->size - 1)) - 1); + } + + /* The hardware rounds before conversion. 
*/ + if (f > 0) + f += 0.5f; + else + f -= 0.5f; + + v = (uint64_t)f; } } else if (channel->type == VK_FORMAT_TYPE_FLOAT) { if (channel->size == 32) { diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c index ef690edb471..f3a8f6464b8 100644 --- a/src/amd/vulkan/radv_meta_blit.c +++ b/src/amd/vulkan/radv_meta_blit.c @@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device, .subpass = 0, }; - switch(aspect) { - case VK_IMAGE_ASPECT_COLOR_BIT: - vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, - .attachmentCount = 1, - .pAttachments = (VkPipelineColorBlendAttachmentState []) { - { .colorWriteMask = - VK_COLOR_COMPONENT_A_BIT | - VK_COLOR_COMPONENT_R_BIT | - VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT }, + VkPipelineColorBlendStateCreateInfo color_blend_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { + .colorWriteMask = VK_COLOR_COMPONENT_A_BIT | + VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT }, } }; + + VkPipelineDepthStencilStateCreateInfo depth_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = true, + .depthWriteEnable = true, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + VkPipelineDepthStencilStateCreateInfo stencil_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .stencilTestEnable = true, + .front = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .back = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = 
VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + switch(aspect) { + case VK_IMAGE_ASPECT_COLOR_BIT: + vk_pipeline_info.pColorBlendState = &color_blend_info; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - .depthTestEnable = true, - .depthWriteEnable = true, - .depthCompareOp = VK_COMPARE_OP_ALWAYS, - }; + vk_pipeline_info.pDepthStencilState = &depth_info; break; case VK_IMAGE_ASPECT_STENCIL_BIT: - vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - .depthTestEnable = false, - .depthWriteEnable = false, - .stencilTestEnable = true, - .front = { - .failOp = VK_STENCIL_OP_REPLACE, - .passOp = VK_STENCIL_OP_REPLACE, - .depthFailOp = VK_STENCIL_OP_REPLACE, - .compareOp = VK_COMPARE_OP_ALWAYS, - .compareMask = 0xff, - .writeMask = 0xff, - .reference = 0 - }, - .back = { - .failOp = VK_STENCIL_OP_REPLACE, - .passOp = VK_STENCIL_OP_REPLACE, - .depthFailOp = VK_STENCIL_OP_REPLACE, - .compareOp = VK_COMPARE_OP_ALWAYS, - .compareMask = 0xff, - .writeMask = 0xff, - .reference = 0 - }, - .depthCompareOp = VK_COMPARE_OP_ALWAYS, - }; + vk_pipeline_info.pDepthStencilState = &stencil_info; break; default: unreachable("Unhandled aspect"); diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c index 8805f0435e1..32af736fd8f 100644 --- a/src/amd/vulkan/radv_meta_clear.c +++ b/src/amd/vulkan/radv_meta_clear.c @@ -370,14 +370,29 @@ emit_color_clear(struct radv_cmd_buffer *cmd_buffer, const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; const uint32_t subpass_att = clear_att->colorAttachment; const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment; - const 
struct radv_image_view *iview = fb->attachments[pass_att].attachment; - const uint32_t samples = iview->image->info.samples; - const uint32_t samples_log2 = ffs(samples) - 1; - unsigned fs_key = radv_format_meta_fs_key(iview->vk_format); + const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL; + uint32_t samples, samples_log2; + VkFormat format; + unsigned fs_key; VkClearColorValue clear_value = clear_att->clearValue.color; VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); VkPipeline pipeline; + /* When a framebuffer is bound to the current command buffer, get the + * number of samples from it. Otherwise, get the number of samples from + * the render pass because it's likely a secondary command buffer. + */ + if (iview) { + samples = iview->image->info.samples; + format = iview->vk_format; + } else { + samples = cmd_buffer->state.pass->attachments[pass_att].samples; + format = cmd_buffer->state.pass->attachments[pass_att].format; + } + + samples_log2 = ffs(samples) - 1; + fs_key = radv_format_meta_fs_key(format); + if (fs_key == -1) { radv_finishme("color clears incomplete"); return; @@ -617,6 +632,9 @@ static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearRect *clear_rect, VkClearDepthStencilValue clear_value) { + if (!iview) + return false; + uint32_t queue_mask = radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index, cmd_buffer->queue_family_index); @@ -633,7 +651,7 @@ static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer, iview->base_mip == 0 && iview->base_layer == 0 && radv_layout_is_htile_compressed(iview->image, layout, queue_mask) && - !radv_image_extent_compare(iview->image, &iview->extent)) + radv_image_extent_compare(iview->image, &iview->extent)) return true; return false; } @@ -705,11 +723,22 @@ emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer, const uint32_t pass_att = subpass->depth_stencil_attachment.attachment; 
VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil; VkImageAspectFlags aspects = clear_att->aspectMask; - const struct radv_image_view *iview = fb->attachments[pass_att].attachment; - const uint32_t samples = iview->image->info.samples; - const uint32_t samples_log2 = ffs(samples) - 1; + const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL; + uint32_t samples, samples_log2; VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + /* When a framebuffer is bound to the current command buffer, get the + * number of samples from it. Otherwise, get the number of samples from + * the render pass because it's likely a secondary command buffer. + */ + if (iview) { + samples = iview->image->info.samples; + } else { + samples = cmd_buffer->state.pass->attachments[pass_att].samples; + } + + samples_log2 = ffs(samples) - 1; + assert(pass_att != VK_ATTACHMENT_UNUSED); if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) @@ -915,7 +944,11 @@ static bool radv_image_view_can_fast_clear(struct radv_device *device, const struct radv_image_view *iview) { - struct radv_image *image = iview->image; + struct radv_image *image; + + if (!iview) + return false; + image = iview->image; /* Only fast clear if the image itself can be fast cleared. */ if (!radv_image_can_fast_clear(device, image)) @@ -1523,7 +1556,7 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer, const uint32_t subpass_att = clear_att->colorAttachment; const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment; VkImageLayout image_layout = subpass->color_attachments[subpass_att].layout; - const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + const struct radv_image_view *iview = fb ? 
fb->attachments[pass_att].attachment : NULL; VkClearColorValue clear_value = clear_att->clearValue.color; if (radv_can_fast_clear_color(cmd_buffer, iview, image_layout, @@ -1536,8 +1569,11 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer, } } else { const uint32_t pass_att = subpass->depth_stencil_attachment.attachment; + if (pass_att == VK_ATTACHMENT_UNUSED) + return; + VkImageLayout image_layout = subpass->depth_stencil_attachment.layout; - const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL; VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil; assert(aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index e80938527e5..00d65de8164 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -92,6 +92,7 @@ struct radv_shader_context { gl_shader_stage stage; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; + uint64_t float16_shaded_mask; uint64_t input_mask; uint64_t output_mask; @@ -1441,7 +1442,7 @@ store_tcs_output(struct ac_shader_abi *abi, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); const unsigned location = var->data.location; - const unsigned component = var->data.location_frac; + unsigned component = var->data.location_frac; const bool is_patch = var->data.patch; const bool is_compact = var->data.compact; LLVMValueRef dw_addr; @@ -1459,10 +1460,14 @@ store_tcs_output(struct ac_shader_abi *abi, } param = shader_io_get_unique_index(location); - if (location == VARYING_SLOT_CLIP_DIST0 && - is_compact && const_index > 3) { - const_index -= 3; - param++; + if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { + const_index += component; + component = 0; + + if (const_index >= 4) { + const_index -= 4; + param++; + } } if (!is_patch) { @@ -1529,9 +1534,13 @@ 
load_tes_input(struct ac_shader_abi *abi, LLVMValueRef result; unsigned param = shader_io_get_unique_index(location); - if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) { - const_index -= 3; - param++; + if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { + const_index += component; + component = 0; + if (const_index >= 4) { + const_index -= 4; + param++; + } } buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, @@ -2018,10 +2027,32 @@ handle_vs_input_decl(struct radv_shader_context *ctx, t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - input = ac_build_buffer_load_format(&ctx->ac, t_list, - buffer_index, - ctx->ac.i32_0, - num_channels, false, true); + if (ctx->options->key.vs.vertex_attribute_provided & (1u << attrib_index)) { + input = ac_build_buffer_load_format(&ctx->ac, t_list, + buffer_index, + ctx->ac.i32_0, + num_channels, false, true); + } else { + /* Per the Vulkan spec, it's invalid to consume vertex + * attributes that are not provided by the pipeline but + * some (invalid) apps appear to do that. Fill the + * input array with (eg. (0, 0, 0, 1)) to workaround + * the problem and to avoid possible GPU hangs. + */ + LLVMValueRef chan[4]; + + /* The input_usage mask might be 0 if input variables + * are not removed by the compiler. + */ + num_channels = CLAMP(num_channels, 1, 4); + + for (unsigned i = 0; i < num_channels; i++) { + chan[i] = i == 3 ? 
ctx->ac.f32_1 : ctx->ac.f32_0; + chan[i] = ac_to_float(&ctx->ac, chan[i]); + } + + input = ac_build_gather_values(&ctx->ac, chan, num_channels); + } input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels); @@ -2051,6 +2082,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, unsigned attr, LLVMValueRef interp_param, LLVMValueRef prim_mask, + bool float16, LLVMValueRef result[4]) { LLVMValueRef attr_number; @@ -2083,7 +2115,12 @@ static void interp_fs_input(struct radv_shader_context *ctx, for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - if (interp) { + if (interp && float16) { + result[chan] = ac_build_fs_interp_f16(&ctx->ac, + llvm_chan, + attr_number, + prim_mask, i, j); + } else if (interp) { result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, @@ -2095,7 +2132,30 @@ static void interp_fs_input(struct radv_shader_context *ctx, attr_number, prim_mask); result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); - result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), ""); + result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? 
ctx->ac.i16 : ctx->ac.i32, ""); + } + } +} + +static void mark_16bit_fs_input(struct radv_shader_context *ctx, + const struct glsl_type *type, + int location) +{ + if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { + unsigned attrib_count = glsl_count_attribute_slots(type, false); + if (glsl_type_is_16bit(type)) { + ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; + } + } else if (glsl_type_is_array(type)) { + unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); + for (unsigned i = 0; i < glsl_get_length(type); ++i) { + mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride); + } + } else { + assert(glsl_type_is_struct(type)); + for (unsigned i = 0; i < glsl_get_length(type); i++) { + mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location); + location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); } } } @@ -2110,9 +2170,20 @@ handle_fs_input_decl(struct radv_shader_context *ctx, uint64_t mask; variable->data.driver_location = idx * 4; + + + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + + glsl_get_length(variable->type); + attrib_count = (component_count + 3) / 4; + } else + mark_16bit_fs_input(ctx, variable->type, idx); + mask = ((1ull << attrib_count) - 1) << variable->data.location; - if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) { + if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT || + glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT16 || + glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_STRUCT) { unsigned interp_type; if (variable->data.sample) interp_type = INTERP_SAMPLE; @@ -2123,22 +2194,12 @@ handle_fs_input_decl(struct radv_shader_context *ctx, interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); } - bool is_16bit = 
glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32; if (interp == NULL) - interp = LLVMGetUndef(type); + interp = LLVMGetUndef(ctx->ac.i32); for (unsigned i = 0; i < attrib_count; ++i) ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; - if (idx == VARYING_SLOT_CLIP_DIST0) { - /* Do not account for the number of components inside the array - * of clip/cull distances because this might wrongly set other - * bits like primitive ID or layer. - */ - mask = 1ull << VARYING_SLOT_CLIP_DIST0; - } - ctx->input_mask |= mask; } @@ -2200,11 +2261,14 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { interp_param = *inputs; - interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, + bool float16 = (ctx->float16_shaded_mask >> i) & 1; + interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16, inputs); if (LLVMIsUndef(interp_param)) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + if (float16) + ctx->shader_info->fs.float16_shaded_mask |= 1u << index; if (i >= VARYING_SLOT_VAR0) ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; ++index; @@ -2216,7 +2280,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, interp_param = *inputs; interp_fs_input(ctx, index, interp_param, - ctx->abi.prim_mask, inputs); + ctx->abi.prim_mask, false, inputs); ++index; } } else if (i == VARYING_SLOT_POS) { @@ -2250,6 +2314,12 @@ scan_shader_output_decl(struct radv_shader_context *ctx, if (stage == MESA_SHADER_TESS_CTRL) return; + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + + glsl_get_length(variable->type); + attrib_count = (component_count + 3) / 4; + } + mask_attribs = ((1ull << attrib_count) - 1) << idx; if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || @@ -2265,8 +2335,6 @@ scan_shader_output_decl(struct 
radv_shader_context *ctx, ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; } - - mask_attribs = 1ull << idx; } } @@ -2365,7 +2433,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildZExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2376,7 +2444,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildSExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2429,12 +2497,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, } else memcpy(&args->out[0], values, sizeof(values[0]) * 4); - for (unsigned i = 0; i < 4; ++i) { - if (!(args->enabled_channels & (1 << i))) - continue; - + for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); - } } static void @@ -2615,51 +2679,41 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset)); - if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) { - unsigned output_usage_mask, length; - LLVMValueRef slots[8]; - unsigned j; - - if (ctx->stage == MESA_SHADER_VERTEX && - !ctx->is_gs_copy_shader) { - output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[VARYING_SLOT_CLIP_DIST0]; - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[VARYING_SLOT_CLIP_DIST0]; - } else { - assert(ctx->is_gs_copy_shader); - output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[VARYING_SLOT_CLIP_DIST0]; - } + for(unsigned location = VARYING_SLOT_CLIP_DIST0; location <= VARYING_SLOT_CLIP_DIST1; 
++location) { + if (ctx->output_mask & (1ull << location)) { + unsigned output_usage_mask, length; + LLVMValueRef slots[4]; + unsigned j; + + if (ctx->stage == MESA_SHADER_VERTEX && + !ctx->is_gs_copy_shader) { + output_usage_mask = + ctx->shader_info->info.vs.output_usage_mask[location]; + } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { + output_usage_mask = + ctx->shader_info->info.tes.output_usage_mask[location]; + } else { + assert(ctx->is_gs_copy_shader); + output_usage_mask = + ctx->shader_info->info.gs.output_usage_mask[location]; + } - length = util_last_bit(output_usage_mask); + length = util_last_bit(output_usage_mask); - i = VARYING_SLOT_CLIP_DIST0; - for (j = 0; j < length; j++) - slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); + for (j = 0; j < length; j++) + slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, location, j)); - for (i = length; i < 8; i++) - slots[i] = LLVMGetUndef(ctx->ac.f32); + for (i = length; i < 4; i++) + slots[i] = LLVMGetUndef(ctx->ac.f32); - if (length > 4) { - target = V_008DFC_SQ_EXP_POS + 3; - si_llvm_init_export_args(ctx, &slots[4], 0xf, target, &args); + target = V_008DFC_SQ_EXP_POS + 2 + (location - VARYING_SLOT_CLIP_DIST0); + si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args); memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - } + &args, sizeof(args)); - target = V_008DFC_SQ_EXP_POS + 2; - si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args); - memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - - /* Export the clip/cull distances values to the next stage. */ - radv_export_param(ctx, param_count, &slots[0], 0xf); - outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0] = param_count++; - if (length > 4) { - radv_export_param(ctx, param_count, &slots[4], 0xf); - outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1] = param_count++; + /* Export the clip/cull distances values to the next stage. 
*/ + radv_export_param(ctx, param_count, &slots[0], 0xf); + outinfo->vs_output_param_offset[location] = param_count++; } } @@ -2820,28 +2874,14 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMValueRef lds_base = NULL; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask; int param_index; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; - if (ctx->stage == MESA_SHADER_VERTEX) { - output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; - } else { - assert(ctx->stage == MESA_SHADER_TESS_EVAL); - output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[i]; - } - - if (i == VARYING_SLOT_CLIP_DIST0) - length = util_last_bit(output_usage_mask); - param_index = shader_io_get_unique_index(i); - max_output_written = MAX2(param_index + (length > 4), max_output_written); + max_output_written = MAX2(param_index, max_output_written); } outinfo->esgs_itemsize = (max_output_written + 1) * 16; @@ -2862,7 +2902,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; unsigned output_usage_mask; int param_index; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; @@ -2876,9 +2915,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ctx->shader_info->info.tes.output_usage_mask[i]; } - if (i == VARYING_SLOT_CLIP_DIST0) - length = util_last_bit(output_usage_mask); - param_index = shader_io_get_unique_index(i); if (lds_base) { @@ -2887,7 +2923,7 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ""); } - for (j = 0; j < length; j++) { + for (j = 0; j < 4; j++) { if (!(output_usage_mask & (1 << j))) continue; @@ -2924,22 +2960,16 @@ handle_ls_outputs_post(struct radv_shader_context *ctx) vertex_dw_stride, ""); for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; - int length = 4; if 
(!(ctx->output_mask & (1ull << i))) continue; - if (i == VARYING_SLOT_CLIP_DIST0) - length = util_last_bit(output_usage_mask); - int param = shader_io_get_unique_index(i); LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, false), ""); - for (unsigned j = 0; j < length; j++) { + for (unsigned j = 0; j < 4; j++) { LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); value = ac_to_integer(&ctx->ac, value); value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); @@ -3467,10 +3497,17 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.clamp_shadow_reference = false; ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800; + /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8, + * we fallback to the old intrinsics for atomic buffer image operations + * and thus we need to apply the indexing workaround... + */ + ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900; + if (shader_count >= 2) ac_init_exec_full_mask(&ctx.ac); - if (ctx.ac.chip_class == GFX9 && + if ((ctx.ac.family == CHIP_VEGA10 || + ctx.ac.family == CHIP_RAVEN) && shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 138e153f9a4..2526000f56f 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -524,6 +524,14 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline, col_format |= cf << (4 * i); } + if (!col_format && blend->need_src_alpha & (1 << 0)) { + /* When a subpass doesn't have any color attachments, write the + * alpha channel of MRT0 when alpha coverage is enabled because + * the depth attachment needs it. 
+ */ + col_format |= V_028714_SPI_SHADER_32_ABGR; + } + /* If the i-th target format is set, all previous target formats must * be non-zero to avoid hangs. */ @@ -689,6 +697,7 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline, if (vkms && vkms->alphaToCoverageEnable) { blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); + blend.need_src_alpha |= 0x1; } blend.cb_target_mask = 0; @@ -1436,11 +1445,13 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline, const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info = vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT); - if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) { + if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) { dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount; - typed_memcpy(dynamic->discard_rectangle.rectangles, - discard_rectangle_info->pDiscardRectangles, - discard_rectangle_info->discardRectangleCount); + if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) { + typed_memcpy(dynamic->discard_rectangle.rectangles, + discard_rectangle_info->pDiscardRectangles, + discard_rectangle_info->discardRectangleCount); + } } pipeline->dynamic_state.mask = states; @@ -1913,6 +1924,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline, } key.vertex_alpha_adjust |= adjust << (2 * location); } + + key.vertex_attribute_provided |= 1 << location; } if (pCreateInfo->pTessellationState) @@ -1941,6 +1954,7 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys, { keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs; keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust; + keys[MESA_SHADER_VERTEX].vs.vertex_attribute_provided = key->vertex_attribute_provided; for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i) keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i]; @@ -3079,13 +3093,17 @@ 
radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader); } -static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade) +static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, bool float16) { uint32_t ps_input_cntl; if (offset <= AC_EXP_PARAM_OFFSET_31) { ps_input_cntl = S_028644_OFFSET(offset); if (flat_shade) ps_input_cntl |= S_028644_FLAT_SHADE(1); + if (float16) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } } else { /* The input is a DEFAULT_VAL constant. */ assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && @@ -3110,7 +3128,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, if (ps->info.info.ps.prim_id_input) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); ++ps_offset; } } @@ -3120,9 +3138,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, ps->info.info.needs_multiview_view_index) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); else - ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false); ++ps_offset; } @@ -3138,14 +3156,14 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); 
++ps_offset; } vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1]; if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.info.ps.num_input_clips_culls > 4) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); ++ps_offset; } } @@ -3153,6 +3171,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) { unsigned vs_offset; bool flat_shade; + bool float16; if (!(ps->info.fs.input_mask & (1u << i))) continue; @@ -3164,8 +3183,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, } flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset)); + float16 = !!(ps->info.fs.float16_shaded_mask & (1u << ps_offset)); - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, float16); ++ps_offset; } @@ -3192,11 +3212,11 @@ radv_compute_db_shader_control(const struct radv_device *device, bool disable_rbplus = device->physical_device->has_rbplus && !device->physical_device->rbplus_allowed; - /* Do not enable the gl_SampleMask fragment shader output if MSAA is - * disabled. + /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled + * but this appears to break Project Cars (DXVK). 
See + * https://bugs.freedesktop.org/show_bug.cgi?id=109401 */ - bool mask_export_enable = ms->num_samples > 1 && - ps->info.info.ps.writes_sample_mask; + bool mask_export_enable = ps->info.info.ps.writes_sample_mask; return S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) | S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) | diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 85c18906f84..ea957ae6dab 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -306,6 +306,9 @@ struct radv_physical_device { /* Whether DCC should be enabled for MSAA textures. */ bool dcc_msaa_allowed; + /* Whether LOAD_CONTEXT_REG packets are supported. */ + bool has_load_ctx_reg_pkt; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ @@ -362,6 +365,7 @@ struct radv_pipeline_cache { struct radv_pipeline_key { uint32_t instance_rate_inputs; uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; + uint32_t vertex_attribute_provided; uint64_t vertex_alpha_adjust; unsigned tess_input_vertices; uint32_t col_format; @@ -1144,6 +1148,7 @@ void si_write_scissors(struct radeon_cmdbuf *cs, int first, const VkViewport *viewports, bool can_use_guardband); uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw, + bool count_from_stream_output, uint32_t draw_vertex_count); void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs, enum chip_class chip_class, @@ -1462,6 +1467,7 @@ bool radv_format_pack_clear_color(VkFormat format, bool radv_is_colorbuffer_format_supported(VkFormat format, bool *blendable); bool radv_dcc_formats_compatible(VkFormat format1, VkFormat format2); +bool radv_device_supports_etc(struct radv_physical_device *physical_device); struct radv_fmask_info { uint64_t offset; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 32cd9ae25e9..ec571e2f8c5 100644 --- 
a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, NIR_PASS(progress, shader, nir_opt_if); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true); NIR_PASS(progress, shader, nir_opt_algebraic); NIR_PASS(progress, shader, nir_opt_constant_folding); NIR_PASS(progress, shader, nir_opt_undef); @@ -222,6 +222,8 @@ radv_shader_compile_to_nir(struct radv_device *device, .lower_ubo_ssbo_access_to_offsets = true, .caps = { .descriptor_array_dynamic_indexing = true, + .descriptor_array_non_uniform_indexing = true, + .descriptor_indexing = true, .device_group = true, .draw_parameters = true, .float64 = true, @@ -610,6 +612,8 @@ shader_variant_create(struct radv_device *device, tm_options |= AC_TM_SISCHED; if (options->check_ir) tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); radv_init_llvm_once(); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 3652a811e80..f6f9dd2bbf1 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -66,6 +66,9 @@ struct radv_vs_variant_key { uint32_t instance_rate_inputs; uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; + /* Mask of vertex attributes that are provided by the pipeline. */ + uint32_t vertex_attribute_provided; + /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. * so we may need to fix it up. 
*/ uint64_t alpha_adjust; @@ -257,6 +260,7 @@ struct radv_shader_variant_info { unsigned num_interp; uint32_t input_mask; uint32_t flat_shaded_mask; + uint32_t float16_shaded_mask; bool can_discard; bool early_fragment_test; } fs; @@ -401,6 +405,8 @@ static inline unsigned shader_io_get_unique_index(gl_varying_slot slot) return 1; if (slot == VARYING_SLOT_CLIP_DIST0) return 2; + if (slot == VARYING_SLOT_CLIP_DIST1) + return 3; /* 3 is reserved for clip dist as well */ if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31) return 4 + (slot - VARYING_SLOT_VAR0); diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 7e5a3789af2..fdc4f52086b 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -101,7 +101,7 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, case MESA_SHADER_VERTEX: { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - if (var->data.mode == nir_var_shader_in) { + if (var && var->data.mode == nir_var_shader_in) { unsigned idx = var->data.location; uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa); @@ -115,6 +115,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, } } +static uint32_t +widen_writemask(uint32_t wrmask) +{ + uint32_t new_wrmask = 0; + for(unsigned i = 0; i < 4; i++) + new_wrmask |= (wrmask & (1 << i) ? 
0x3 : 0x0) << (i * 2); + return new_wrmask; +} + static void set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, uint8_t *output_usage_mask) @@ -122,25 +131,27 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref_instr); - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); + unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, false); unsigned idx = var->data.location; unsigned comp = var->data.location_frac; unsigned const_offset = 0; get_deref_offset(deref_instr, &const_offset); - if (idx == VARYING_SLOT_CLIP_DIST0) { - /* Special case for clip/cull distances because there are - * combined into a single array that contains both. - */ - output_usage_mask[idx] |= 1 << const_offset; + if (var->data.compact) { + assert(!glsl_type_is_64bit(deref_instr->type)); + const_offset += comp; + output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset % 4); return; } - for (unsigned i = 0; i < attrib_count; i++) { + uint32_t wrmask = nir_intrinsic_write_mask(instr); + if (glsl_type_is_64bit(deref_instr->type)) + wrmask = widen_writemask(wrmask); + + for (unsigned i = 0; i < attrib_count; i++) output_usage_mask[idx + i + const_offset] |= - instr->const_index[0] << comp; - } + ((wrmask >> (i * 4)) & 0xf) << comp; } static void @@ -150,7 +161,7 @@ gather_intrinsic_store_deref_info(const nir_shader *nir, { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - if (var->data.mode == nir_var_shader_out) { + if (var && var->data.mode == nir_var_shader_out) { unsigned idx = var->data.location; switch (nir->info.stage) { @@ -174,13 +185,9 @@ gather_intrinsic_store_deref_info(const nir_shader *nir, type = glsl_get_array_element(var->type); unsigned slots = - var->data.compact ? 
DIV_ROUND_UP(glsl_get_length(type), 4) + var->data.compact ? DIV_ROUND_UP(var->data.location_frac + glsl_get_length(type), 4) : glsl_count_attribute_slots(type, false); - if (idx == VARYING_SLOT_CLIP_DIST0) - slots = (nir->info.clip_distance_array_size + - nir->info.cull_distance_array_size > 4) ? 2 : 1; - mark_tess_output(info, var->data.patch, param, slots); break; } @@ -374,7 +381,8 @@ gather_info_input_decl_ps(const nir_shader *nir, const nir_variable *var, info->ps.layer_input = true; break; case VARYING_SLOT_CLIP_DIST0: - info->ps.num_input_clips_culls = attrib_count; + case VARYING_SLOT_CLIP_DIST1: + info->ps.num_input_clips_culls += attrib_count; break; default: break; @@ -409,8 +417,8 @@ gather_info_output_decl_ls(const nir_shader *nir, const nir_variable *var, int idx = var->data.location; unsigned param = shader_io_get_unique_index(idx); int num_slots = glsl_count_attribute_slots(var->type, false); - if (idx == VARYING_SLOT_CLIP_DIST0) - num_slots = (nir->info.clip_distance_array_size + nir->info.cull_distance_array_size > 4) ? 2 : 1; + if (var->data.compact) + num_slots = DIV_ROUND_UP(var->data.location_frac + glsl_get_length(var->type), 4); mark_ls_output(info, param, num_slots); } diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index e75c6d127d6..e73c13762e5 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -561,6 +561,7 @@ radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num) uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw, + bool count_from_stream_output, uint32_t draw_vertex_count) { enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; @@ -622,6 +623,12 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, (instanced_draw || indirect_draw)) partial_vs_wave = true; + /* Hardware requirement when drawing primitives from a stream + * output buffer. 
+ */ + if (count_from_stream_output) + wd_switch_on_eop = true; + /* If the WD switch is false, the IA switch must be false too. */ assert(wd_switch_on_eop || !ia_switch_on_eop); } diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index d3b1e2cd4c6..49a86a72c31 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -543,7 +543,7 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, cs->handles[cs->num_buffers].bo_handle = bo; cs->handles[cs->num_buffers].bo_priority = priority; - hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1); + hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1); cs->buffer_hash_table[hash] = cs->num_buffers; ++cs->num_buffers; @@ -665,6 +665,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, assert(num < ws->num_buffers); handles[num].bo_handle = bo->bo_handle; handles[num].bo_priority = bo->priority; + num++; } r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h index 854e216551f..709669b2a57 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h @@ -29,6 +29,13 @@ #ifndef RADV_AMDGPU_WINSYS_PUBLIC_H #define RADV_AMDGPU_WINSYS_PUBLIC_H +/* The number of IBs per submit isn't infinite, it depends on the ring type + * (ie. some initial setup needed for a submit) and the number of IBs (4 DW). + * This limit is arbitrary but should be safe for now. Ideally, we should get + * this limit from the KMD. 
+*/ +#define RADV_MAX_IBS_PER_SUBMIT 192 + struct radeon_winsys *radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags); diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml index 754461dc067..06e8ddad7ec 100644 --- a/src/broadcom/cle/v3d_packet_v33.xml +++ b/src/broadcom/cle/v3d_packet_v33.xml @@ -820,8 +820,8 @@ - - + + diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h index e10b4586609..cb1ee7c96f4 100644 --- a/src/broadcom/common/v3d_cpu_tiling.h +++ b/src/broadcom/common/v3d_cpu_tiling.h @@ -159,9 +159,8 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride, * d0-d7. */ "vstm %[gpu], {q0, q1, q2, q3}\n" - : + : [cpu] "+r"(cpu) : [gpu] "r"(gpu), - [cpu] "r"(cpu), [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); return; diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index ee7a3e6bc00..e21ee246eff 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -32,7 +32,8 @@ */ #define V3D_MAX_TEXTURE_SAMPLERS 16 -#define V3D_MAX_MIP_LEVELS 12 +/* The HW can do 16384 (15), but we run into hangs when we expose that. 
*/ +#define V3D_MAX_MIP_LEVELS 13 #define V3D_MAX_SAMPLES 4 diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index a5e75f650e8..bd19bb9b0b6 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -121,7 +121,7 @@ vir_emit_thrsw(struct v3d_compile *c) */ c->last_thrsw = vir_NOP(c); c->last_thrsw->qpu.sig.thrsw = true; - c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); + c->last_thrsw_at_top_level = !c->in_control_flow; } static uint32_t @@ -1158,7 +1158,9 @@ emit_frag_end(struct v3d_compile *c) inst->src[vir_get_implicit_uniform_src(inst)] = vir_uniform_ui(c, tlb_specifier | 0xffffff00); + c->writes_z = true; } else if (c->s->info.fs.uses_discard || + !c->s->info.fs.early_fragment_tests || c->fs_key->sample_alpha_to_coverage || !has_any_tlb_color_write) { /* Emit passthrough Z if it needed to be delayed until shader @@ -1188,6 +1190,7 @@ emit_frag_end(struct v3d_compile *c) inst->src[vir_get_implicit_uniform_src(inst)] = vir_uniform_ui(c, tlb_specifier | 0xffffff00); + c->writes_z = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1455,7 +1458,7 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); @@ -2103,10 +2106,10 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) else else_block = vir_new_block(c); - bool was_top_level = false; + bool was_uniform_control_flow = false; if (c->execute.file == QFILE_NULL) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); - was_top_level = true; + was_uniform_control_flow = true; } /* Set up the flags for the IF condition (taking the THEN branch). 
*/ @@ -2122,7 +2125,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and * was previously active (execute Z) for updating the exec flags. */ - if (was_top_level) { + if (was_uniform_control_flow) { cond = v3d_qpu_cond_invert(cond); } else { struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), @@ -2176,7 +2179,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_link_blocks(c->cur_block, after_block); vir_set_emit_block(c, after_block); - if (was_top_level) + if (was_uniform_control_flow) c->execute = c->undef; else ntq_activate_execute_for_block(c); @@ -2185,12 +2188,15 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) static void ntq_emit_if(struct v3d_compile *c, nir_if *nif) { + bool was_in_control_flow = c->in_control_flow; + c->in_control_flow = true; if (c->execute.file == QFILE_NULL && nir_src_is_dynamically_uniform(nif->condition)) { ntq_emit_uniform_if(c, nif); } else { ntq_emit_nonuniform_if(c, nif); } + c->in_control_flow = was_in_control_flow; } static void @@ -2267,10 +2273,13 @@ static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { - bool was_top_level = false; + bool was_in_control_flow = c->in_control_flow; + c->in_control_flow = true; + + bool was_uniform_control_flow = false; if (c->execute.file == QFILE_NULL) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); - was_top_level = true; + was_uniform_control_flow = true; } struct qblock *save_loop_cont_block = c->loop_cont_block; @@ -2307,7 +2316,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) vir_link_blocks(c->cur_block, c->loop_break_block); vir_set_emit_block(c, c->loop_break_block); - if (was_top_level) + if (was_uniform_control_flow) c->execute = c->undef; else ntq_activate_execute_for_block(c); @@ -2316,6 +2325,8 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) 
c->loop_cont_block = save_loop_cont_block; c->loops++; + + c->in_control_flow = was_in_control_flow; } static void diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 127b04136d1..671aba3c551 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -519,6 +519,7 @@ struct v3d_compile { uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; bool uses_center_w; + bool writes_z; struct v3d_ubo_range *ubo_ranges; bool *ubo_range_used; @@ -531,6 +532,7 @@ struct v3d_compile { * yes, otherwise a block number + 1 that the channel jumped to. */ struct qreg execute; + bool in_control_flow; struct qreg line_x, point_x, point_y; @@ -716,7 +718,7 @@ struct v3d_fs_prog_data { uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; bool writes_z; - bool discard; + bool disable_ez; bool uses_center_w; }; diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c index e74206b3949..2aa3cbad495 100644 --- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c @@ -156,7 +156,7 @@ pack_sint(nir_builder *b, nir_ssa_def *color, const unsigned *bits, int num_components) { color = nir_channels(b, color, (1 << num_components) - 1); - color = nir_format_clamp_uint(b, color, bits); + color = nir_format_clamp_sint(b, color, bits); return pack_bits(b, color, bits, num_components, true); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 10105fbd861..20f7004149c 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -777,21 +777,9 @@ v3d_fs_set_prog_data(struct v3d_compile *c, struct v3d_fs_prog_data *prog_data) { v3d_set_fs_prog_data_inputs(c, prog_data); - prog_data->writes_z = (c->s->info.outputs_written & - (1 << FRAG_RESULT_DEPTH)); - prog_data->discard = (c->s->info.fs.uses_discard || - 
c->fs_key->sample_alpha_to_coverage); + prog_data->writes_z = c->writes_z; + prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; - - /* If the shader has some side effects and hasn't allowed early - * fragment tests, disable them. - */ - if (!c->s->info.fs.early_fragment_tests && - (c->s->info.num_images || - c->s->info.num_ssbos || - c->s->info.num_abos)) { - prog_data->discard = true; - } } static void @@ -888,6 +876,15 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) { if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); + + /* If the shader has no non-TLB side effects, we can promote it to + * enabling early_fragment_tests even if the user didn't. + */ + if (!(c->s->info.num_images || + c->s->info.num_ssbos || + c->s->info.num_abos)) { + c->s->info.fs.early_fragment_tests = true; + } } static void diff --git a/src/compiler/Android.glsl.gen.mk b/src/compiler/Android.glsl.gen.mk index e31eb6f101f..3b94ea7bd2f 100644 --- a/src/compiler/Android.glsl.gen.mk +++ b/src/compiler/Android.glsl.gen.mk @@ -104,6 +104,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< strings > $@ -$(intermediates)/compiler/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py +$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@ diff --git a/src/compiler/Android.glsl.mk b/src/compiler/Android.glsl.mk index 0aabafa2673..37b3cb80251 100644 --- a/src/compiler/Android.glsl.mk +++ b/src/compiler/Android.glsl.mk @@ -48,7 +48,7 @@ LOCAL_STATIC_LIBRARIES := \ libmesa_nir LOCAL_MODULE := libmesa_glsl - +LOCAL_CFLAGS += -Wno-error include $(LOCAL_PATH)/Android.glsl.gen.mk include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk index 
75a247a245d..59da5dbdc1c 100644 --- a/src/compiler/Android.nir.mk +++ b/src/compiler/Android.nir.mk @@ -41,6 +41,9 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary +LOCAL_CFLAGS := \ + -Wno-missing-braces + LOCAL_STATIC_LIBRARIES := libmesa_compiler LOCAL_MODULE := libmesa_nir diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 489022a22a1..0b40c3c6ebe 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -229,6 +229,7 @@ NIR_FILES = \ nir/nir_lower_alpha_test.c \ nir/nir_lower_alu.c \ nir/nir_lower_alu_to_scalar.c \ + nir/nir_lower_array_deref_of_vec.c \ nir/nir_lower_atomics_to_ssbo.c \ nir/nir_lower_bitmap.c \ nir/nir_lower_bit_size.c \ @@ -251,6 +252,7 @@ NIR_FILES = \ nir/nir_lower_io_arrays_to_elements.c \ nir/nir_lower_io_to_temporaries.c \ nir/nir_lower_io_to_scalar.c \ + nir/nir_lower_io_to_vector.c \ nir/nir_lower_packing.c \ nir/nir_lower_passthrough_edgeflags.c \ nir/nir_lower_patch_vertices.c \ diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 620153e6a34..8c707265e44 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -3698,6 +3698,10 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual, "cannot be applied to a matrix, a structure, " "a block, or an array containing any of " "these."); + } else if (components > 4 && type->is_64bit()) { + _mesa_glsl_error(loc, state, "component layout qualifier " + "cannot be applied to dvec%u.", + components / 2); } else if (qual_component != 0 && (qual_component + components - 1) > 3) { _mesa_glsl_error(loc, state, "component overflow (%u > 3)", @@ -3940,7 +3944,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, "`invariant' after being used", var->name); } else { - var->data.invariant = 1; + var->data.explicit_invariant = true; + var->data.invariant = true; } } @@ -4148,8 +4153,10 @@ 
apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, } } - if (state->all_invariant && var->data.mode == ir_var_shader_out) + if (state->all_invariant && var->data.mode == ir_var_shader_out) { + var->data.explicit_invariant = true; var->data.invariant = true; + } var->data.interpolation = interpret_interpolation_qualifier(qual, var->type, @@ -4857,6 +4864,7 @@ ast_declarator_list::hir(exec_list *instructions, "`invariant' after being used", earlier->name); } else { + earlier->data.explicit_invariant = true; earlier->data.invariant = true; } } diff --git a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c index 719968a6671..87718112db7 100644 --- a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c +++ b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c @@ -147,10 +147,20 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state, remove_struct_derefs_prep(path.path, &name, &location, &type); - assert(location < state->shader_program->data->NumUniformStorage && - state->shader_program->data->UniformStorage[location].opaque[stage].active); + if (state->shader_program && var->data.how_declared != nir_var_hidden) { + /* For GLSL programs, look up the bindings in the uniform storage. */ + assert(location < state->shader_program->data->NumUniformStorage && + state->shader_program->data->UniformStorage[location].opaque[stage].active); - binding = state->shader_program->data->UniformStorage[location].opaque[stage].index; + binding = state->shader_program->data->UniformStorage[location].opaque[stage].index; + } else { + /* For ARB programs, built-in shaders, or internally generated sampler + * variables in GLSL programs, assume that whoever created the shader + * set the bindings correctly already. + */ + assert(var->data.explicit_binding); + binding = var->data.binding; + } if (var->type == type) { /* Fast path: We did not encounter any struct derefs. 
*/ @@ -167,6 +177,14 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state, } else { var = nir_variable_create(state->shader, nir_var_uniform, type, name); var->data.binding = binding; + + /* Don't set var->data.location. The old structure location could be + * used to index into gl_uniform_storage, assuming the full structure + * was walked in order. With the new split variables, this invariant + * no longer holds and there's no meaningful way to start from a base + * location and access a particular array element. Just leave it 0. + */ + _mesa_hash_table_insert_pre_hashed(state->remap_table, hash, name, var); } diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y index 1c095cb66f9..c951d9526ac 100644 --- a/src/compiler/glsl/glcpp/glcpp-parse.y +++ b/src/compiler/glsl/glcpp/glcpp-parse.y @@ -224,10 +224,12 @@ expanded_line: glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro); _glcpp_parser_skip_stack_change_if (parser, & @1, "elif", $2.value); } -| LINE_EXPANDED integer_constant NEWLINE { +| LINE_EXPANDED expression NEWLINE { + if (parser->is_gles && $2.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro); parser->has_new_line_number = 1; - parser->new_line_number = $2; - _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2); + parser->new_line_number = $2.value; + _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2.value); } | LINE_EXPANDED integer_constant integer_constant NEWLINE { parser->has_new_line_number = 1; @@ -238,6 +240,17 @@ expanded_line: "#line %" PRIiMAX " %" PRIiMAX "\n", $2, $3); } +| LINE_EXPANDED '(' expression ')' '(' expression ')' NEWLINE { + if (parser->is_gles && $3.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $3.undefined_macro); + if (parser->is_gles && $6.undefined_macro) + 
glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $6.undefined_macro); + parser->has_new_line_number = 1; + parser->new_line_number = $3.value; + parser->has_new_source_number = 1; + parser->new_source_number = $6.value; + _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX " %" PRIiMAX "\n", $3.value, $6.value); + } ; define: diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp index d2db0f95aca..47fc2fea160 100644 --- a/src/compiler/glsl/glsl_to_nir.cpp +++ b/src/compiler/glsl/glsl_to_nir.cpp @@ -353,6 +353,12 @@ nir_visitor::visit(ir_variable *ir) ir->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)) { var->data.compact = ir->type->without_array()->is_scalar(); } + + if (shader->info.stage > MESA_SHADER_VERTEX && + ir->data.location >= VARYING_SLOT_CLIP_DIST0 && + ir->data.location <= VARYING_SLOT_CULL_DIST1) { + var->data.compact = ir->type->without_array()->is_scalar(); + } } break; @@ -363,6 +369,12 @@ nir_visitor::visit(ir_variable *ir) ir->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)) { var->data.compact = ir->type->without_array()->is_scalar(); } + + if (shader->info.stage <= MESA_SHADER_GEOMETRY && + ir->data.location >= VARYING_SLOT_CLIP_DIST0 && + ir->data.location <= VARYING_SLOT_CULL_DIST1) { + var->data.compact = ir->type->without_array()->is_scalar(); + } break; case ir_var_uniform: diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp index 1d1a56ae9a5..f5aa1be4e20 100644 --- a/src/compiler/glsl/ir.cpp +++ b/src/compiler/glsl/ir.cpp @@ -1734,6 +1734,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name, this->data.centroid = false; this->data.sample = false; this->data.patch = false; + this->data.explicit_invariant = false; this->data.invariant = false; this->data.how_declared = ir_var_declared_normally; this->data.mode = mode; diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index d05d1998a50..8b32ed8209a 100644 --- 
a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -233,7 +233,7 @@ class ir_rvalue : public ir_instruction { ir_rvalue *as_rvalue_to_saturate(); - virtual bool is_lvalue(const struct _mesa_glsl_parse_state *state = NULL) const + virtual bool is_lvalue(const struct _mesa_glsl_parse_state * = NULL) const { return false; } @@ -657,6 +657,19 @@ class ir_variable : public ir_instruction { unsigned centroid:1; unsigned sample:1; unsigned patch:1; + /** + * Was an 'invariant' qualifier explicitly set in the shader? + * + * This is used to cross validate qualifiers. + */ + unsigned explicit_invariant:1; + /** + * Is the variable invariant? + * + * It can happen either by having the 'invariant' qualifier + * explicitly set in the shader or by being used in calculations + * of other invariant variables. + */ unsigned invariant:1; unsigned precise:1; diff --git a/src/compiler/glsl/ir_print_visitor.cpp b/src/compiler/glsl/ir_print_visitor.cpp index ef6bca1229e..b055d25d60d 100644 --- a/src/compiler/glsl/ir_print_visitor.cpp +++ b/src/compiler/glsl/ir_print_visitor.cpp @@ -199,6 +199,7 @@ void ir_print_visitor::visit(ir_variable *ir) const char *const samp = (ir->data.sample) ? "sample " : ""; const char *const patc = (ir->data.patch) ? "patch " : ""; const char *const inv = (ir->data.invariant) ? "invariant " : ""; + const char *const explicit_inv = (ir->data.explicit_invariant) ? "explicit_invariant " : ""; const char *const prec = (ir->data.precise) ? "precise " : ""; const char *const bindless = (ir->data.bindless) ? "bindless " : ""; const char *const bound = (ir->data.bound) ? 
"bound " : ""; @@ -215,11 +216,11 @@ void ir_print_visitor::visit(ir_variable *ir) const char *const interp[] = { "", "smooth", "flat", "noperspective" }; STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_MODE_COUNT); - fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ", + fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ", binding, loc, component, cent, bindless, bound, image_format, memory_read_only, memory_write_only, memory_coherent, memory_volatile, memory_restrict, - samp, patc, inv, prec, mode[ir->data.mode], + samp, patc, inv, explicit_inv, prec, mode[ir->data.mode], stream, interp[ir->data.interpolation]); diff --git a/src/compiler/glsl/ir_reader.cpp b/src/compiler/glsl/ir_reader.cpp index b87933ba511..d4f0e58b155 100644 --- a/src/compiler/glsl/ir_reader.cpp +++ b/src/compiler/glsl/ir_reader.cpp @@ -419,8 +419,10 @@ ir_reader::read_declaration(s_expression *expr) var->data.sample = 1; } else if (strcmp(qualifier->value(), "patch") == 0) { var->data.patch = 1; + } else if (strcmp(qualifier->value(), "explicit_invariant") == 0) { + var->data.explicit_invariant = true; } else if (strcmp(qualifier->value(), "invariant") == 0) { - var->data.invariant = 1; + var->data.invariant = true; } else if (strcmp(qualifier->value(), "uniform") == 0) { var->data.mode = ir_var_uniform; } else if (strcmp(qualifier->value(), "shader_storage") == 0) { diff --git a/src/compiler/glsl/link_uniform_block_active_visitor.cpp b/src/compiler/glsl/link_uniform_block_active_visitor.cpp index 368981852c0..0af3b312071 100644 --- a/src/compiler/glsl/link_uniform_block_active_visitor.cpp +++ b/src/compiler/glsl/link_uniform_block_active_visitor.cpp @@ -103,6 +103,7 @@ process_arrays(void *mem_ctx, ir_dereference_array *ir, if (*ub_array_ptr == NULL) { *ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements); (*ub_array_ptr)->ir = ir; + (*ub_array_ptr)->original_dim_size = block->type->length; } struct uniform_block_array_elements *ub_array = *ub_array_ptr; diff --git 
a/src/compiler/glsl/link_uniform_block_active_visitor.h b/src/compiler/glsl/link_uniform_block_active_visitor.h index fbac65d5b67..a8ea3f52b6d 100644 --- a/src/compiler/glsl/link_uniform_block_active_visitor.h +++ b/src/compiler/glsl/link_uniform_block_active_visitor.h @@ -32,6 +32,7 @@ struct uniform_block_array_elements { unsigned num_array_elements; ir_dereference_array *ir; + unsigned original_dim_size; struct uniform_block_array_elements *array; }; diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp index 0b890586298..1665fc3f8cb 100644 --- a/src/compiler/glsl/link_uniform_blocks.cpp +++ b/src/compiler/glsl/link_uniform_blocks.cpp @@ -244,18 +244,21 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name, for (unsigned j = 0; j < ub_array->num_array_elements; j++) { size_t new_length = name_length; + unsigned int element_idx = ub_array->array_elements[j]; /* Append the subscript to the current variable name */ - ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", - ub_array->array_elements[j]); + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", element_idx); if (ub_array->array) { + unsigned boffset = (*binding_offset) + (element_idx * + ub_array->original_dim_size); process_block_array(ub_array->array, name, new_length, blocks, parcel, variables, b, block_index, - binding_offset, ctx, prog, first_index); + &boffset, ctx, prog, first_index); } else { + unsigned boffset = (*binding_offset) + element_idx; process_block_array_leaf(*name, blocks, parcel, variables, b, block_index, - binding_offset, *block_index - first_index, + &boffset, *block_index - first_index, ctx, prog); } } @@ -307,7 +310,6 @@ process_block_array_leaf(const char *name, (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms); *block_index = *block_index + 1; - *binding_offset = *binding_offset + 1; } /* This function resizes the array types of the block so that later we can use @@ -440,6 +442,7 
@@ link_uniform_blocks(void *mem_ctx, GLSL_INTERFACE_PACKING_PACKED)) { b->type = resize_block_array(b->type, b->array); b->var->type = b->type; + b->var->data.max_array_access = b->type->length - 1; } block_size.num_active_uniforms = 0; diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp index 63e688b19a7..13fc603ce7a 100644 --- a/src/compiler/glsl/link_uniforms.cpp +++ b/src/compiler/glsl/link_uniforms.cpp @@ -62,6 +62,15 @@ program_resource_visitor::process(const glsl_type *type, const char *name, void program_resource_visitor::process(ir_variable *var, bool use_std430_as_default) +{ + const glsl_type *t = + var->data.from_named_ifc_block ? var->get_interface_type() : var->type; + process(var, t, use_std430_as_default); +} + +void +program_resource_visitor::process(ir_variable *var, const glsl_type *var_type, + bool use_std430_as_default) { unsigned record_array_count = 1; const bool row_major = @@ -72,8 +81,7 @@ program_resource_visitor::process(ir_variable *var, bool use_std430_as_default) get_internal_ifc_packing(use_std430_as_default) : var->type->get_internal_ifc_packing(use_std430_as_default); - const glsl_type *t = - var->data.from_named_ifc_block ? var->get_interface_type() : var->type; + const glsl_type *t = var_type; const glsl_type *t_without_array = t->without_array(); /* false is always passed for the row_major parameter to the other diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 3969c0120b3..28187e2f0a4 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -309,16 +309,16 @@ cross_validate_types_and_qualifiers(struct gl_context *ctx, * "The invariance of varyings that are declared in both the vertex * and fragment shaders must match." */ - if (input->data.invariant != output->data.invariant && + if (input->data.explicit_invariant != output->data.explicit_invariant && prog->data->Version < (prog->IsES ? 
300 : 430)) { linker_error(prog, "%s shader output `%s' %s invariant qualifier, " "but %s shader input %s invariant qualifier\n", _mesa_shader_stage_to_string(producer_stage), output->name, - (output->data.invariant) ? "has" : "lacks", + (output->data.explicit_invariant) ? "has" : "lacks", _mesa_shader_stage_to_string(consumer_stage), - (input->data.invariant) ? "has" : "lacks"); + (input->data.explicit_invariant) ? "has" : "lacks"); return; } @@ -424,28 +424,14 @@ compute_variable_location_slot(ir_variable *var, gl_shader_stage stage) struct explicit_location_info { ir_variable *var; - unsigned numerical_type; + bool base_type_is_integer; + unsigned base_type_bit_size; unsigned interpolation; bool centroid; bool sample; bool patch; }; -static inline unsigned -get_numerical_type(const glsl_type *type) -{ - /* From the OpenGL 4.6 spec, section 4.4.1 Input Layout Qualifiers, Page 68, - * (Location aliasing): - * - * "Further, when location aliasing, the aliases sharing the location - * must have the same underlying numerical type (floating-point or - * integer) - */ - if (type->is_float() || type->is_double()) - return GLSL_TYPE_FLOAT; - return GLSL_TYPE_INT; -} - static bool check_location_aliasing(struct explicit_location_info explicit_locations[][4], ir_variable *var, @@ -461,14 +447,23 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], gl_shader_stage stage) { unsigned last_comp; - if (type->without_array()->is_record()) { - /* The component qualifier can't be used on structs so just treat - * all component slots as used. + unsigned base_type_bit_size; + const glsl_type *type_without_array = type->without_array(); + const bool base_type_is_integer = + glsl_base_type_is_integer(type_without_array->base_type); + const bool is_struct = type_without_array->is_record(); + if (is_struct) { + /* structs don't have a defined underlying base type so just treat all + * component slots as used and set the bit size to 0. 
If there is + * location aliasing, we'll fail anyway later. */ last_comp = 4; + base_type_bit_size = 0; } else { - unsigned dmul = type->without_array()->is_64bit() ? 2 : 1; - last_comp = component + type->without_array()->vector_elements * dmul; + unsigned dmul = type_without_array->is_64bit() ? 2 : 1; + last_comp = component + type_without_array->vector_elements * dmul; + base_type_bit_size = + glsl_base_type_get_bit_size(type_without_array->base_type); } while (location < location_limit) { @@ -478,8 +473,22 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], &explicit_locations[location][comp]; if (info->var) { - /* Component aliasing is not alloed */ - if (comp >= component && comp < last_comp) { + if (info->var->type->without_array()->is_record() || is_struct) { + /* Structs cannot share location since they are incompatible + * with any other underlying numerical type. + */ + linker_error(prog, + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "underlying numerical type. Struct variable '%s', " + "location %u\n", + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? "in" : "out", + is_struct ? 
var->name : info->var->name, + location); + return false; + } else if (comp >= component && comp < last_comp) { + /* Component aliasing is not allowed */ linker_error(prog, "%s shader has multiple %sputs explicitly " "assigned to location %d and component %d\n", @@ -488,27 +497,52 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], location, comp); return false; } else { - /* For all other used components we need to have matching - * types, interpolation and auxiliary storage + /* From the OpenGL 4.60.5 spec, section 4.4.1 Input Layout + * Qualifiers, Page 67, (Location aliasing): + * + * " Further, when location aliasing, the aliases sharing the + * location must have the same underlying numerical type + * and bit width (floating-point or integer, 32-bit versus + * 64-bit, etc.) and the same auxiliary storage and + * interpolation qualification." */ - if (info->numerical_type != - get_numerical_type(type->without_array())) { + + /* If the underlying numerical type isn't integer, implicitly + * it will be float or else we would have failed by now. + */ + if (info->base_type_is_integer != base_type_is_integer) { linker_error(prog, - "Varyings sharing the same location must " - "have the same underlying numerical type. " - "Location %u component %u\n", - location, comp); + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "underlying numerical type. Location %u " + "component %u.\n", + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? + "in" : "out", location, comp); + return false; + } + + if (info->base_type_bit_size != base_type_bit_size) { + linker_error(prog, + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "underlying numerical bit size. Location %u " + "component %u.\n", + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? 
+ "in" : "out", location, comp); return false; } if (info->interpolation != interpolation) { linker_error(prog, - "%s shader has multiple %sputs at explicit " - "location %u with different interpolation " - "settings\n", + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "interpolation qualification. Location %u " + "component %u.\n", _mesa_shader_stage_to_string(stage), var->data.mode == ir_var_shader_in ? - "in" : "out", location); + "in" : "out", location, comp); return false; } @@ -516,17 +550,20 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], info->sample != sample || info->patch != patch) { linker_error(prog, - "%s shader has multiple %sputs at explicit " - "location %u with different aux storage\n", + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "auxiliary storage qualification. Location %u " + "component %u.\n", _mesa_shader_stage_to_string(stage), var->data.mode == ir_var_shader_in ? - "in" : "out", location); + "in" : "out", location, comp); return false; } } } else if (comp >= component && comp < last_comp) { info->var = var; - info->numerical_type = get_numerical_type(type->without_array()); + info->base_type_is_integer = base_type_is_integer; + info->base_type_bit_size = base_type_bit_size; info->interpolation = interpolation; info->centroid = centroid; info->sample = sample; @@ -773,8 +810,20 @@ cross_validate_outputs_to_inputs(struct gl_context *ctx, output = explicit_locations[idx][input->data.location_frac].var; - if (output == NULL || - input->data.location != output->data.location) { + if (output == NULL) { + /* A linker failure should only happen when there is no + * output declaration and there is Static Use of the + * declared input. 
+ */ + if (input->data.used) { + linker_error(prog, + "%s shader input `%s' with explicit location " + "has no matching output\n", + _mesa_shader_stage_to_string(consumer->Stage), + input->name); + break; + } + } else if (input->data.location != output->data.location) { linker_error(prog, "%s shader input `%s' with explicit location " "has no matching output\n", @@ -804,7 +853,7 @@ cross_validate_outputs_to_inputs(struct gl_context *ctx, */ assert(!input->data.assigned); if (input->data.used && !input->get_interface_type() && - !input->data.explicit_location && !prog->SeparateShader) + !input->data.explicit_location) linker_error(prog, "%s shader input `%s' " "has no matching output in the previous stage\n", @@ -1166,8 +1215,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog, return false; } - if ((this->offset / 4) / info->Buffers[buffer].Stride != - (xfb_offset - 1) / info->Buffers[buffer].Stride) { + if (xfb_offset > info->Buffers[buffer].Stride) { linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for " "buffer (%d)", xfb_offset * 4, info->Buffers[buffer].Stride * 4, buffer); @@ -2124,9 +2172,11 @@ class tfeedback_candidate_generator : public program_resource_visitor { public: tfeedback_candidate_generator(void *mem_ctx, - hash_table *tfeedback_candidates) + hash_table *tfeedback_candidates, + gl_shader_stage stage) : mem_ctx(mem_ctx), tfeedback_candidates(tfeedback_candidates), + stage(stage), toplevel_var(NULL), varying_floats(0) { @@ -2136,10 +2186,17 @@ class tfeedback_candidate_generator : public program_resource_visitor { /* All named varying interface blocks should be flattened by now */ assert(!var->is_interface_instance()); + assert(var->data.mode == ir_var_shader_out); this->toplevel_var = var; this->varying_floats = 0; - program_resource_visitor::process(var, false); + const glsl_type *t = + var->data.from_named_ifc_block ? 
var->get_interface_type() : var->type; + if (!var->data.patch && stage == MESA_SHADER_TESS_CTRL) { + assert(t->is_array()); + t = t->fields.array; + } + program_resource_visitor::process(var, t, false); } private: @@ -2173,6 +2230,8 @@ class tfeedback_candidate_generator : public program_resource_visitor */ hash_table * const tfeedback_candidates; + gl_shader_stage stage; + /** * Pointer to the toplevel variable that is being traversed. */ @@ -2503,8 +2562,28 @@ assign_varying_locations(struct gl_context *ctx, producer->Stage == MESA_SHADER_GEOMETRY)); if (num_tfeedback_decls > 0) { - tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates); - g.process(output_var); + tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates, producer->Stage); + /* From OpenGL 4.6 (Core Profile) spec, section 11.1.2.1 + * ("Vertex Shader Variables / Output Variables") + * + * "Each program object can specify a set of output variables from + * one shader to be recorded in transform feedback mode (see + * section 13.3). The variables that can be recorded are those + * emitted by the first active shader, in order, from the + * following list: + * + * * geometry shader + * * tessellation evaluation shader + * * tessellation control shader + * * vertex shader" + * + * But on OpenGL ES 3.2, section 11.1.2.1 ("Vertex Shader + * Variables / Output Variables") tessellation control shader is + * not included in the stages list. 
+ */ + if (!prog->IsES || producer->Stage != MESA_SHADER_TESS_CTRL) { + g.process(output_var); + } } ir_variable *const input_var = diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index 2d76e852f47..0d9b1befdd5 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -1090,7 +1090,7 @@ cross_validate_globals(struct gl_context *ctx, struct gl_shader_program *prog, } } - if (existing->data.invariant != var->data.invariant) { + if (existing->data.explicit_invariant != var->data.explicit_invariant) { linker_error(prog, "declarations for %s `%s' have " "mismatching invariant qualifiers\n", mode_string(var), var->name); diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h index f6fb00351d4..be92dbf983c 100644 --- a/src/compiler/glsl/linker.h +++ b/src/compiler/glsl/linker.h @@ -134,6 +134,26 @@ class program_resource_visitor { */ void process(ir_variable *var, bool use_std430_as_default); + /** + * Begin processing a variable + * + * Classes that overload this function should call \c ::process from the + * base class to start the recursive processing of the variable. + * + * \param var The variable that is to be processed + * \param var_type The glsl_type reference of the variable + * + * Calls \c ::visit_field for each leaf of the variable. + * + * \warning + * When processing a uniform block, this entry should only be used in cases + * where the row / column ordering of matrices in the block does not + * matter. For example, enumerating the names of members of the block, but + * not for determining the offsets of members. + */ + void process(ir_variable *var, const glsl_type *var_type, + bool use_std430_as_default); + /** * Begin processing a variable of a structured type. 
* diff --git a/src/compiler/glsl/list.h b/src/compiler/glsl/list.h index 59ed766f2e1..979f6fcc539 100644 --- a/src/compiler/glsl/list.h +++ b/src/compiler/glsl/list.h @@ -81,6 +81,12 @@ struct exec_node { * Insert a node in the list after the current node */ void insert_after(exec_node *after); + + /** + * Insert another list in the list after the current node + */ + void insert_after(struct exec_list *after); + /** * Insert a node in the list before the current node */ @@ -507,6 +513,21 @@ exec_list_append(struct exec_list *list, struct exec_list *source) exec_list_make_empty(source); } +static inline void +exec_node_insert_list_after(struct exec_node *n, struct exec_list *after) +{ + if (exec_list_is_empty(after)) + return; + + after->tail_sentinel.prev->next = n->next; + after->head_sentinel.next->prev = n; + + n->next->prev = after->tail_sentinel.prev; + n->next = after->head_sentinel.next; + + exec_list_make_empty(after); +} + static inline void exec_list_prepend(struct exec_list *list, struct exec_list *source) { @@ -635,6 +656,11 @@ inline void exec_list::append_list(exec_list *source) exec_list_append(this, source); } +inline void exec_node::insert_after(exec_list *after) +{ + exec_node_insert_list_after(this, after); +} + inline void exec_list::prepend_list(exec_list *source) { exec_list_prepend(this, source); diff --git a/src/compiler/glsl/lower_vector_derefs.cpp b/src/compiler/glsl/lower_vector_derefs.cpp index 6cd9a2d819a..2aae30d8201 100644 --- a/src/compiler/glsl/lower_vector_derefs.cpp +++ b/src/compiler/glsl/lower_vector_derefs.cpp @@ -32,8 +32,9 @@ namespace { class vector_deref_visitor : public ir_rvalue_enter_visitor { public: - vector_deref_visitor() - : progress(false) + vector_deref_visitor(void *mem_ctx, gl_shader_stage shader_stage) + : progress(false), shader_stage(shader_stage), + factory(&factory_instructions, mem_ctx) { } @@ -45,6 +46,9 @@ class vector_deref_visitor : public ir_rvalue_enter_visitor { virtual ir_visitor_status 
visit_enter(ir_assignment *ir); bool progress; + gl_shader_stage shader_stage; + exec_list factory_instructions; + ir_factory factory; }; } /* anonymous namespace */ @@ -65,13 +69,63 @@ vector_deref_visitor::visit_enter(ir_assignment *ir) ir_constant *old_index_constant = deref->array_index->constant_expression_value(mem_ctx); if (!old_index_constant) { - ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert, - new_lhs->type, - new_lhs->clone(mem_ctx, NULL), - ir->rhs, - deref->array_index); - ir->write_mask = (1 << new_lhs->type->vector_elements) - 1; - ir->set_lhs(new_lhs); + if (shader_stage == MESA_SHADER_TESS_CTRL && + deref->variable_referenced()->data.mode == ir_var_shader_out) { + /* Tessellation control shader outputs act as if they have memory + * backing them and if we have writes from multiple threads + * targeting the same vec4 (this can happen for patch outputs), the + * load-vec-store pattern of ir_triop_vector_insert doesn't work. + * Instead, we have to lower to a series of conditional write-masked + * assignments. + */ + ir_variable *const src_temp = + factory.make_temp(ir->rhs->type, "scalar_tmp"); + + /* The newly created variable declaration goes before the assignment + * because we're going to set it as the new LHS. 
+ */ + ir->insert_before(factory.instructions); + ir->set_lhs(new(mem_ctx) ir_dereference_variable(src_temp)); + + ir_variable *const arr_index = + factory.make_temp(deref->array_index->type, "index_tmp"); + factory.emit(assign(arr_index, deref->array_index)); + + for (unsigned i = 0; i < new_lhs->type->vector_elements; i++) { + ir_constant *const cmp_index = + ir_constant::zero(factory.mem_ctx, deref->array_index->type); + cmp_index->value.u[0] = i; + + ir_rvalue *const lhs_clone = new_lhs->clone(factory.mem_ctx, NULL); + ir_dereference_variable *const src_temp_deref = + new(mem_ctx) ir_dereference_variable(src_temp); + + if (new_lhs->ir_type != ir_type_swizzle) { + assert(lhs_clone->as_dereference()); + ir_assignment *cond_assign = + new(mem_ctx) ir_assignment(lhs_clone->as_dereference(), + src_temp_deref, + equal(arr_index, cmp_index), + WRITEMASK_X << i); + factory.emit(cond_assign); + } else { + ir_assignment *cond_assign = + new(mem_ctx) ir_assignment(swizzle(lhs_clone, i, 1), + src_temp_deref, + equal(arr_index, cmp_index)); + factory.emit(cond_assign); + } + } + ir->insert_after(factory.instructions); + } else { + ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert, + new_lhs->type, + new_lhs->clone(mem_ctx, NULL), + ir->rhs, + deref->array_index); + ir->write_mask = (1 << new_lhs->type->vector_elements) - 1; + ir->set_lhs(new_lhs); + } } else if (new_lhs->ir_type != ir_type_swizzle) { ir->set_lhs(new_lhs); ir->write_mask = 1 << old_index_constant->get_uint_component(0); @@ -105,7 +159,7 @@ vector_deref_visitor::handle_rvalue(ir_rvalue **rv) bool lower_vector_derefs(gl_linked_shader *shader) { - vector_deref_visitor v; + vector_deref_visitor v(shader->ir, shader->Stage); visit_list_elements(&v, shader->ir); diff --git a/src/compiler/glsl/serialize.cpp b/src/compiler/glsl/serialize.cpp index fdd99ec59da..ad258f8bcb1 100644 --- a/src/compiler/glsl/serialize.cpp +++ b/src/compiler/glsl/serialize.cpp @@ -996,15 +996,14 @@ write_shader_parameters(struct 
blob *metadata, struct gl_program_parameter_list *params) { blob_write_uint32(metadata, params->NumParameters); - blob_write_uint32(metadata, params->NumParameterValues); uint32_t i = 0; while (i < params->NumParameters) { struct gl_program_parameter *param = ¶ms->Parameters[i]; - blob_write_uint32(metadata, param->Type); blob_write_string(metadata, param->Name); blob_write_uint32(metadata, param->Size); + blob_write_uint32(metadata, param->Padded); blob_write_uint32(metadata, param->DataType); blob_write_bytes(metadata, param->StateIndexes, sizeof(param->StateIndexes)); @@ -1015,9 +1014,6 @@ write_shader_parameters(struct blob *metadata, blob_write_bytes(metadata, params->ParameterValues, sizeof(gl_constant_value) * params->NumParameterValues); - blob_write_bytes(metadata, params->ParameterValueOffset, - sizeof(uint32_t) * params->NumParameters); - blob_write_uint32(metadata, params->StateFlags); } @@ -1028,28 +1024,25 @@ read_shader_parameters(struct blob_reader *metadata, gl_state_index16 state_indexes[STATE_LENGTH]; uint32_t i = 0; uint32_t num_parameters = blob_read_uint32(metadata); - uint32_t num_parameters_values = blob_read_uint32(metadata); _mesa_reserve_parameter_storage(params, num_parameters); while (i < num_parameters) { gl_register_file type = (gl_register_file) blob_read_uint32(metadata); const char *name = blob_read_string(metadata); unsigned size = blob_read_uint32(metadata); + bool padded = blob_read_uint32(metadata); unsigned data_type = blob_read_uint32(metadata); blob_copy_bytes(metadata, (uint8_t *) state_indexes, sizeof(state_indexes)); _mesa_add_parameter(params, type, name, size, data_type, - NULL, state_indexes, false); + NULL, state_indexes, padded); i++; } blob_copy_bytes(metadata, (uint8_t *) params->ParameterValues, - sizeof(gl_constant_value) * num_parameters_values); - - blob_copy_bytes(metadata, (uint8_t *) params->ParameterValueOffset, - sizeof(uint32_t) * num_parameters); + sizeof(gl_constant_value) * params->NumParameterValues); 
params->StateFlags = blob_read_uint32(metadata); } diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp index 90f4548030f..042f45a926d 100644 --- a/src/compiler/glsl_types.cpp +++ b/src/compiler/glsl_types.cpp @@ -260,6 +260,22 @@ glsl_type::contains_double() const } } +bool +glsl_type::contains_64bit() const +{ + if (this->is_array()) { + return this->fields.array->contains_64bit(); + } else if (this->is_record() || this->is_interface()) { + for (unsigned int i = 0; i < this->length; i++) { + if (this->fields.structure[i].type->contains_64bit()) + return true; + } + return false; + } else { + return this->is_64bit(); + } +} + bool glsl_type::contains_opaque() const { switch (base_type) { diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index bdaeee7ddd7..4767d197449 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -31,6 +31,7 @@ #include "shader_enums.h" #include "blob.h" #include "c11/threads.h" +#include "util/macros.h" #ifdef __cplusplus #include "main/config.h" @@ -114,6 +115,42 @@ static inline bool glsl_base_type_is_integer(enum glsl_base_type type) type == GLSL_TYPE_IMAGE; } +static inline unsigned int +glsl_base_type_get_bit_size(const enum glsl_base_type base_type) +{ + switch (base_type) { + case GLSL_TYPE_BOOL: + return 1; + + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_FLOAT: /* TODO handle mediump */ + case GLSL_TYPE_SUBROUTINE: + return 32; + + case GLSL_TYPE_FLOAT16: + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: + return 16; + + case GLSL_TYPE_UINT8: + case GLSL_TYPE_INT8: + return 8; + + case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + case GLSL_TYPE_IMAGE: + case GLSL_TYPE_SAMPLER: + return 64; + + default: + unreachable("unknown base type"); + } + + return 0; +} + enum glsl_sampler_dim { GLSL_SAMPLER_DIM_1D = 0, GLSL_SAMPLER_DIM_2D, @@ -544,6 +581,12 @@ struct glsl_type { */ bool contains_double() const; + /** + * Query whether or not type is 
a 64-bit type, or for struct, interface and + * array types, contains a double type. + */ + bool contains_64bit() const; + /** * Query whether or not a type is a float type */ diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index 20a26a26255..e6784fcd41f 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -112,6 +112,7 @@ files_libnir = files( 'nir_lower_alu.c', 'nir_lower_alu_to_scalar.c', 'nir_lower_alpha_test.c', + 'nir_lower_array_deref_of_vec.c', 'nir_lower_atomics_to_ssbo.c', 'nir_lower_bitmap.c', 'nir_lower_bool_to_float.c', @@ -133,6 +134,7 @@ files_libnir = files( 'nir_lower_io_arrays_to_elements.c', 'nir_lower_io_to_temporaries.c', 'nir_lower_io_to_scalar.c', + 'nir_lower_io_to_vector.c', 'nir_lower_packing.c', 'nir_lower_passthrough_edgeflags.c', 'nir_lower_patch_vertices.c', diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ff2c41faf27..c43226ba8df 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2825,7 +2825,7 @@ should_print_nir(void) static inline void nir_validate_shader(nir_shader *shader, const char *when) { (void) shader; (void)when; } static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; } static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; } -static inline bool should_skip_nir(const char *pass_name) { return false; } +static inline bool should_skip_nir(UNUSED const char *pass_name) { return false; } static inline bool should_clone_nir(void) { return false; } static inline bool should_serialize_deserialize_nir(void) { return false; } static inline bool should_print_nir(void) { return false; } @@ -2910,6 +2910,16 @@ void nir_fixup_deref_modes(nir_shader *shader); bool nir_lower_global_vars_to_local(nir_shader *shader); +typedef enum { + nir_lower_direct_array_deref_of_vec_load = (1 << 0), + nir_lower_indirect_array_deref_of_vec_load = (1 << 1), + 
nir_lower_direct_array_deref_of_vec_store = (1 << 2), + nir_lower_indirect_array_deref_of_vec_store = (1 << 3), +} nir_lower_array_deref_of_vec_options; + +bool nir_lower_array_deref_of_vec(nir_shader *shader, nir_variable_mode modes, + nir_lower_array_deref_of_vec_options options); + bool nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes); bool nir_lower_locals_to_regs(nir_shader *shader); @@ -2998,6 +3008,7 @@ void nir_lower_io_arrays_to_elements_no_indirects(nir_shader *shader, bool outputs_only); void nir_lower_io_to_scalar(nir_shader *shader, nir_variable_mode mask); void nir_lower_io_to_scalar_early(nir_shader *shader, nir_variable_mode mask); +bool nir_lower_io_to_vector(nir_shader *shader, nir_variable_mode mask); typedef struct nir_lower_subgroups_options { uint8_t subgroup_size; @@ -3090,6 +3101,9 @@ typedef struct nir_lower_tex_options { */ uint8_t swizzles[32][4]; + /* Can be used to scale sampled values in range required by the format. */ + float scale_factors[32]; + /** * Bitmap of textures that need srgb to linear conversion. If * (lower_srgb & (1 << texture_index)) then the rgb (xyz) components @@ -3138,6 +3152,12 @@ typedef struct nir_lower_tex_options { */ bool lower_txd_offset_clamp; + /** + * If true, lower nir_texop_txd with min_lod to a nir_texop_txl if the + * sampler index is not statically determinable to be less than 16. + */ + bool lower_txd_clamp_if_sampler_index_not_lt_16; + /** * If true, apply a .bagr swizzle on tg4 results to handle Broadcom's * mixed-up tg4 locations. 
@@ -3316,7 +3336,7 @@ bool nir_opt_move_comparisons(nir_shader *shader); bool nir_opt_move_load_ubo(nir_shader *shader); bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok); + bool indirect_load_ok); bool nir_opt_remove_phis(nir_shader *shader); diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index 2a36eb3c91b..101bc7ad637 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -560,6 +560,35 @@ nir_channels(nir_builder *b, nir_ssa_def *def, nir_component_mask_t mask) return nir_swizzle(b, def, swizzle, num_channels, false); } +static inline nir_ssa_def * +_nir_vector_extract_helper(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *c, + unsigned start, unsigned end) +{ + if (start == end - 1) { + return nir_channel(b, vec, start); + } else { + unsigned mid = start + (end - start) / 2; + return nir_bcsel(b, nir_ilt(b, c, nir_imm_int(b, mid)), + _nir_vector_extract_helper(b, vec, c, start, mid), + _nir_vector_extract_helper(b, vec, c, mid, end)); + } +} + +static inline nir_ssa_def * +nir_vector_extract(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *c) +{ + nir_src c_src = nir_src_for_ssa(c); + if (nir_src_is_const(c_src)) { + unsigned c_const = nir_src_as_uint(c_src); + if (c_const < vec->num_components) + return nir_channel(b, vec, c_const); + else + return nir_ssa_undef(b, 1, vec->bit_size); + } else { + return _nir_vector_extract_helper(b, vec, c, 0, vec->num_components); + } +} + static inline nir_ssa_def * nir_i2i(nir_builder *build, nir_ssa_def *x, unsigned dest_bit_size) { diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c index 557c7d29f53..24bef4f523a 100644 --- a/src/compiler/nir/nir_clone.c +++ b/src/compiler/nir/nir_clone.c @@ -151,9 +151,11 @@ nir_variable_clone(const nir_variable *var, nir_shader *shader) nvar->name = ralloc_strdup(nvar, var->name); nvar->data = var->data; nvar->num_state_slots = var->num_state_slots; 
- nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots); - memcpy(nvar->state_slots, var->state_slots, - var->num_state_slots * sizeof(nir_state_slot)); + if (var->num_state_slots) { + nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots); + memcpy(nvar->state_slots, var->state_slots, + var->num_state_slots * sizeof(nir_state_slot)); + } if (var->constant_initializer) { nvar->constant_initializer = nir_constant_clone(var->constant_initializer, nvar); diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c index 2f5fda643ca..1e321a66208 100644 --- a/src/compiler/nir/nir_deref.c +++ b/src/compiler/nir/nir_deref.c @@ -215,7 +215,7 @@ nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref, unsigned field_offset = struct_type_get_field_offset(parent->type, size_align, (*p)->strct.index); - nir_iadd(b, offset, nir_imm_int(b, field_offset)); + offset = nir_iadd(b, offset, nir_imm_int(b, field_offset)); } else { unreachable("Unsupported deref type"); } @@ -574,10 +574,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl) _mesa_hash_table_clear(state.cache, NULL); nir_foreach_instr_safe(instr, block) { - if (instr->type == nir_instr_type_deref) { - nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)); + if (instr->type == nir_instr_type_deref && + nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr))) continue; - } state.builder.cursor = nir_before_instr(instr); nir_foreach_src(instr, rematerialize_deref_src, &state); diff --git a/src/compiler/nir/nir_gather_xfb_info.c b/src/compiler/nir/nir_gather_xfb_info.c index 96f0ece5e75..f2a2c0e6b99 100644 --- a/src/compiler/nir/nir_gather_xfb_info.c +++ b/src/compiler/nir/nir_gather_xfb_info.c @@ -33,7 +33,11 @@ add_var_xfb_outputs(nir_xfb_info *xfb, unsigned *offset, const struct glsl_type *type) { - if (glsl_type_is_array(type) || glsl_type_is_matrix(type)) { + /* If this type contains a 64-bit value, align to 8 bytes */ + if 
(glsl_type_contains_64bit(type)) + *offset = ALIGN_POT(*offset, 8); + + if (glsl_type_is_array_or_matrix(type) && !var->data.compact) { unsigned length = glsl_get_length(type); const struct glsl_type *child_type = glsl_get_array_element(type); for (unsigned i = 0; i < length; i++) @@ -58,32 +62,43 @@ add_var_xfb_outputs(nir_xfb_info *xfb, assert(var->data.stream < NIR_MAX_XFB_STREAMS); xfb->streams_written |= (1 << var->data.stream); - unsigned comp_slots = glsl_get_component_slots(type); - unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4); - assert(attrib_slots == glsl_count_attribute_slots(type, false)); - - /* Ensure that we don't have, for instance, a dvec2 with a location_frac - * of 2 which would make it crass a location boundary even though it - * fits in a single slot. However, you can have a dvec3 which crosses - * the slot boundary with a location_frac of 2. - */ - assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == attrib_slots); + unsigned comp_slots; + if (var->data.compact) { + /* This only happens for clip/cull which are float arrays */ + assert(glsl_without_array(type) == glsl_float_type()); + assert(var->data.location == VARYING_SLOT_CLIP_DIST0 || + var->data.location == VARYING_SLOT_CLIP_DIST1); + comp_slots = glsl_get_length(type); + } else { + comp_slots = glsl_get_component_slots(type); + + unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4); + assert(attrib_slots == glsl_count_attribute_slots(type, false)); + + /* Ensure that we don't have, for instance, a dvec2 with a + * location_frac of 2 which would make it cross a location boundary
+ */ + assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == + attrib_slots); + } assert(var->data.location_frac + comp_slots <= 8); uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac; - assert(attrib_slots <= 2); - for (unsigned s = 0; s < attrib_slots; s++) { + while (comp_mask) { nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++]; output->buffer = buffer; - output->offset = *offset + s * 16; + output->offset = *offset; output->location = *location; - output->component_mask = (comp_mask >> (s * 4)) & 0xf; + output->component_mask = comp_mask & 0xf; + *offset += util_bitcount(output->component_mask) * 4; (*location)++; + comp_mask >>= 4; } - *offset += comp_slots * 4; } } diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c index aaa4204cce9..764fd6d443e 100644 --- a/src/compiler/nir/nir_linking_helpers.c +++ b/src/compiler/nir/nir_linking_helpers.c @@ -59,6 +59,15 @@ get_variable_io_mask(nir_variable *var, gl_shader_stage stage) return ((1ull << slots) - 1) << location; } +static uint8_t +get_num_components(nir_variable *var) +{ + if (glsl_type_is_struct(glsl_without_array(var->type))) + return 4; + + return glsl_get_vector_elements(glsl_without_array(var->type)); +} + static void tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read) { @@ -80,12 +89,14 @@ tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read) continue; nir_variable *var = nir_deref_instr_get_variable(deref); - if (var->data.patch) { - patches_read[var->data.location_frac] |= - get_variable_io_mask(var, shader->info.stage); - } else { - read[var->data.location_frac] |= - get_variable_io_mask(var, shader->info.stage); + for (unsigned i = 0; i < get_num_components(var); i++) { + if (var->data.patch) { + patches_read[var->data.location_frac + i] |= + get_variable_io_mask(var, shader->info.stage); + } else { + read[var->data.location_frac + i] |= + 
get_variable_io_mask(var, shader->info.stage); + } } } } @@ -161,22 +172,26 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer) uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 }; nir_foreach_variable(var, &producer->outputs) { - if (var->data.patch) { - patches_written[var->data.location_frac] |= - get_variable_io_mask(var, producer->info.stage); - } else { - written[var->data.location_frac] |= - get_variable_io_mask(var, producer->info.stage); + for (unsigned i = 0; i < get_num_components(var); i++) { + if (var->data.patch) { + patches_written[var->data.location_frac + i] |= + get_variable_io_mask(var, producer->info.stage); + } else { + written[var->data.location_frac + i] |= + get_variable_io_mask(var, producer->info.stage); + } } } nir_foreach_variable(var, &consumer->inputs) { - if (var->data.patch) { - patches_read[var->data.location_frac] |= - get_variable_io_mask(var, consumer->info.stage); - } else { - read[var->data.location_frac] |= - get_variable_io_mask(var, consumer->info.stage); + for (unsigned i = 0; i < get_num_components(var); i++) { + if (var->data.patch) { + patches_read[var->data.location_frac + i] |= + get_variable_io_mask(var, consumer->info.stage); + } else { + read[var->data.location_frac + i] |= + get_variable_io_mask(var, consumer->info.stage); + } } } diff --git a/src/compiler/nir/nir_lower_array_deref_of_vec.c b/src/compiler/nir/nir_lower_array_deref_of_vec.c new file mode 100644 index 00000000000..2a70dd1ddbc --- /dev/null +++ b/src/compiler/nir/nir_lower_array_deref_of_vec.c @@ -0,0 +1,190 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" + +static void +build_write_masked_store(nir_builder *b, nir_deref_instr *vec_deref, + nir_ssa_def *value, unsigned component) +{ + assert(value->num_components == 1); + unsigned num_components = glsl_get_components(vec_deref->type); + assert(num_components > 1 && num_components <= NIR_MAX_VEC_COMPONENTS); + + nir_ssa_def *u = nir_ssa_undef(b, 1, value->bit_size); + nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < num_components; i++) + comps[i] = (i == component) ? 
value : u; + + nir_ssa_def *vec = nir_vec(b, comps, num_components); + nir_store_deref(b, vec_deref, vec, (1u << component)); +} + +static void +build_write_masked_stores(nir_builder *b, nir_deref_instr *vec_deref, + nir_ssa_def *value, nir_ssa_def *index, + unsigned start, unsigned end) +{ + if (start == end - 1) { + build_write_masked_store(b, vec_deref, value, start); + } else { + unsigned mid = start + (end - start) / 2; + nir_push_if(b, nir_ilt(b, index, nir_imm_int(b, mid))); + build_write_masked_stores(b, vec_deref, value, index, start, mid); + nir_push_else(b, NULL); + build_write_masked_stores(b, vec_deref, value, index, mid, end); + nir_pop_if(b, NULL); + } +} + +static bool +nir_lower_array_deref_of_vec_impl(nir_function_impl *impl, + nir_variable_mode modes, + nir_lower_array_deref_of_vec_options options) +{ + bool progress = false; + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + assert(intrin->intrinsic != nir_intrinsic_copy_deref); + + if (intrin->intrinsic != nir_intrinsic_load_deref && + intrin->intrinsic != nir_intrinsic_interp_deref_at_centroid && + intrin->intrinsic != nir_intrinsic_interp_deref_at_sample && + intrin->intrinsic != nir_intrinsic_interp_deref_at_offset && + intrin->intrinsic != nir_intrinsic_store_deref) + continue; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!(deref->mode & modes)) + continue; + + /* We only care about array derefs that act on vectors */ + if (deref->deref_type != nir_deref_type_array) + continue; + + nir_deref_instr *vec_deref = nir_deref_instr_parent(deref); + if (!glsl_type_is_vector(vec_deref->type)) + continue; + + assert(intrin->num_components == 1); + unsigned num_components = glsl_get_components(vec_deref->type); + assert(num_components > 1 && num_components <= 
NIR_MAX_VEC_COMPONENTS); + + b.cursor = nir_after_instr(&intrin->instr); + + if (intrin->intrinsic == nir_intrinsic_store_deref) { + assert(intrin->src[1].is_ssa); + nir_ssa_def *value = intrin->src[1].ssa; + + if (nir_src_is_const(deref->arr.index)) { + if (!(options & nir_lower_direct_array_deref_of_vec_store)) + continue; + + unsigned index = nir_src_as_uint(deref->arr.index); + /* If index is OOB, we throw the old store away and don't + * replace it with anything. + */ + if (index < num_components) + build_write_masked_store(&b, vec_deref, value, index); + } else { + if (!(options & nir_lower_indirect_array_deref_of_vec_store)) + continue; + + nir_ssa_def *index = nir_ssa_for_src(&b, deref->arr.index, 1); + build_write_masked_stores(&b, vec_deref, value, index, + 0, num_components); + } + nir_instr_remove(&intrin->instr); + + progress = true; + } else { + if (nir_src_is_const(deref->arr.index)) { + if (!(options & nir_lower_direct_array_deref_of_vec_load)) + continue; + } else { + if (!(options & nir_lower_indirect_array_deref_of_vec_load)) + continue; + } + + /* Turn the load into a vector load */ + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(&vec_deref->dest.ssa)); + intrin->dest.ssa.num_components = num_components; + intrin->num_components = num_components; + + nir_ssa_def *index = nir_ssa_for_src(&b, deref->arr.index, 1); + nir_ssa_def *scalar = + nir_vector_extract(&b, &intrin->dest.ssa, index); + if (scalar->parent_instr->type == nir_instr_type_ssa_undef) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(scalar)); + nir_instr_remove(&intrin->instr); + } else { + nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, + nir_src_for_ssa(scalar), + scalar->parent_instr); + } + progress = true; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + + return progress; +} + +/* Lowers away array dereferences on vectors + * + * These are allowed on certain 
variable types such as SSBOs and TCS outputs. + * However, not everyone can actually handle them everywhere. There are also + * cases where we want to lower them for performance reasons. + * + * This patch assumes that copy_deref instructions have already been lowered. + */ +bool +nir_lower_array_deref_of_vec(nir_shader *shader, nir_variable_mode modes, + nir_lower_array_deref_of_vec_options options) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl && + nir_lower_array_deref_of_vec_impl(function->impl, modes, options)) + progress = true; + } + + return progress; +} diff --git a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c index 6e1557ef40d..b7cd7c50b11 100644 --- a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c +++ b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c @@ -27,10 +27,10 @@ /** * @file * - * This pass combines separate clip and cull distance arrays into a - * single array that contains both. Clip distances come first, then - * cull distances. It also populates nir_shader_info with the size - * of the original arrays so the driver knows which are which. + * This pass combines clip and cull distance arrays in separate locations and + * colocates them both in VARYING_SLOT_CLIP_DIST0. It does so by maintaining + * two arrays but making them compact and using location_frac to stack them on + * top of each other. */ /** @@ -56,77 +56,6 @@ get_unwrapped_array_length(nir_shader *nir, nir_variable *var) return glsl_get_length(type); } -/** - * Update the type of the combined array (including interface block nesting). 
- */ -static void -update_type(nir_variable *var, gl_shader_stage stage, unsigned length) -{ - const struct glsl_type *type = glsl_array_type(glsl_float_type(), length, 0); - - if (nir_is_per_vertex_io(var, stage)) - type = glsl_array_type(type, glsl_get_length(var->type), 0); - - var->type = type; -} - -static void -rewrite_clip_cull_deref(nir_builder *b, - nir_deref_instr *deref, - const struct glsl_type *type, - unsigned tail_offset) -{ - deref->type = type; - - if (glsl_type_is_array(type)) { - const struct glsl_type *child_type = glsl_get_array_element(type); - nir_foreach_use(src, &deref->dest.ssa) { - rewrite_clip_cull_deref(b, nir_instr_as_deref(src->parent_instr), - child_type, tail_offset); - } - } else { - assert(glsl_type_is_scalar(type)); - - /* This is the end of the line. Add the tail offset if needed */ - if (tail_offset > 0) { - b->cursor = nir_before_instr(&deref->instr); - assert(deref->deref_type == nir_deref_type_array); - nir_ssa_def *index = nir_iadd(b, deref->arr.index.ssa, - nir_imm_int(b, tail_offset)); - nir_instr_rewrite_src(&deref->instr, &deref->arr.index, - nir_src_for_ssa(index)); - } - } -} - -static void -rewrite_references(nir_builder *b, - nir_instr *instr, - nir_variable *combined, - unsigned cull_offset) -{ - if (instr->type != nir_instr_type_deref) - return; - - nir_deref_instr *deref = nir_instr_as_deref(instr); - if (deref->deref_type != nir_deref_type_var) - return; - - if (deref->var->data.mode != combined->data.mode) - return; - - const unsigned location = deref->var->data.location; - if (location != VARYING_SLOT_CLIP_DIST0 && - location != VARYING_SLOT_CULL_DIST0) - return; - - deref->var = combined; - if (location == VARYING_SLOT_CULL_DIST0) - rewrite_clip_cull_deref(b, deref, combined->type, cull_offset); - else - rewrite_clip_cull_deref(b, deref, combined->type, 0); -} - static bool combine_clip_cull(nir_shader *nir, struct exec_list *vars, @@ -134,7 +63,6 @@ combine_clip_cull(nir_shader *nir, { nir_variable *cull = 
NULL; nir_variable *clip = NULL; - bool progress = false; nir_foreach_variable(var, vars) { if (var->data.location == VARYING_SLOT_CLIP_DIST0) @@ -144,7 +72,9 @@ combine_clip_cull(nir_shader *nir, cull = var; } - /* if the GLSL lowering pass has already run, don't bother repeating */ + if (!cull && !clip) + return false; + if (!cull && clip) { if (!glsl_type_is_array(clip->type)) return false; @@ -158,50 +88,29 @@ combine_clip_cull(nir_shader *nir, nir->info.cull_distance_array_size = cull_array_size; } - if (clip) - clip->data.compact = true; - - if (cull) - cull->data.compact = true; - - if (cull_array_size > 0) { - if (clip_array_size == 0) { - /* No clip distances, just change the cull distance location */ - cull->data.location = VARYING_SLOT_CLIP_DIST0; - } else { - /* Turn the ClipDistance array into a combined one */ - update_type(clip, nir->info.stage, clip_array_size + cull_array_size); - - /* Rewrite CullDistance to reference the combined array */ - nir_foreach_function(function, nir) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr(instr, block) { - rewrite_references(&b, instr, clip, clip_array_size); - } - } - } - } - - /* Delete the old CullDistance variable */ - exec_node_remove(&cull->node); - ralloc_free(cull); - } + if (clip) { + assert(clip->data.compact); + clip->data.how_declared = nir_var_hidden; + } - nir_foreach_function(function, nir) { - if (function->impl) { - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + if (cull) { + assert(cull->data.compact); + cull->data.how_declared = nir_var_hidden; + cull->data.location = VARYING_SLOT_CLIP_DIST0 + clip_array_size / 4; + cull->data.location_frac = clip_array_size % 4; + } + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance | + 
nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); } - progress = true; } - return progress; + return true; } bool diff --git a/src/compiler/nir/nir_lower_io_to_temporaries.c b/src/compiler/nir/nir_lower_io_to_temporaries.c index 7602637d428..d2b069d3d68 100644 --- a/src/compiler/nir/nir_lower_io_to_temporaries.c +++ b/src/compiler/nir/nir_lower_io_to_temporaries.c @@ -85,7 +85,8 @@ emit_output_copies_impl(struct lower_io_state *state, nir_function_impl *impl) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic == nir_intrinsic_emit_vertex) { + if (intrin->intrinsic == nir_intrinsic_emit_vertex || + intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter) { b.cursor = nir_before_instr(&intrin->instr); emit_copies(&b, &state->shader->outputs, &state->old_outputs); } diff --git a/src/compiler/nir/nir_lower_io_to_vector.c b/src/compiler/nir/nir_lower_io_to_vector.c new file mode 100644 index 00000000000..d979962373d --- /dev/null +++ b/src/compiler/nir/nir_lower_io_to_vector.c @@ -0,0 +1,387 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" +#include "nir_deref.h" + +/** @file nir_lower_io_to_vector.c + * + * Merges compatible input/output variables residing in different components + * of the same location. It's expected that further passes such as + * nir_lower_io_to_temporaries will combine loads and stores of the merged + * variables, producing vector nir_load_input/nir_store_output instructions + * when all is said and done. + */ + +static const struct glsl_type * +resize_array_vec_type(const struct glsl_type *type, unsigned num_components) +{ + if (glsl_type_is_array(type)) { + const struct glsl_type *arr_elem = + resize_array_vec_type(glsl_get_array_element(type), num_components); + return glsl_array_type(arr_elem, glsl_get_length(type), 0); + } else { + assert(glsl_type_is_vector_or_scalar(type)); + return glsl_vector_type(glsl_get_base_type(type), num_components); + } +} + +static bool +variable_can_rewrite(const nir_variable *var) +{ + /* Only touch user defined varyings as these are the only ones we split */ + if (var->data.location < VARYING_SLOT_VAR0) + return false; + + /* Skip complex types we don't split in the first place */ + if (!glsl_type_is_vector_or_scalar(glsl_without_array(var->type))) + return false; + + /* TODO: add 64/16bit support ? 
*/ + if (glsl_get_bit_size(glsl_without_array(var->type)) != 32) + return false; + + return true; +} + +static bool +variables_can_merge(nir_shader *shader, + const nir_variable *a, const nir_variable *b) +{ + const struct glsl_type *a_type_tail = a->type; + const struct glsl_type *b_type_tail = b->type; + + /* They must have the same array structure */ + while (glsl_type_is_array(a_type_tail)) { + if (!glsl_type_is_array(b_type_tail)) + return false; + + if (glsl_get_length(a_type_tail) != glsl_get_length(b_type_tail)) + return false; + + a_type_tail = glsl_get_array_element(a_type_tail); + b_type_tail = glsl_get_array_element(b_type_tail); + } + + if (!glsl_type_is_vector_or_scalar(a_type_tail) || + !glsl_type_is_vector_or_scalar(b_type_tail)) + return false; + + if (glsl_get_base_type(a->type) != glsl_get_base_type(b->type)) + return false; + + assert(a->data.mode == b->data.mode); + if (shader->info.stage == MESA_SHADER_FRAGMENT && + a->data.mode == nir_var_shader_in && + a->data.interpolation != b->data.interpolation) + return false; + + return true; +} + +static bool +create_new_io_vars(nir_shader *shader, struct exec_list *io_list, + nir_variable *old_vars[MAX_VARYINGS_INCL_PATCH][4], + nir_variable *new_vars[MAX_VARYINGS_INCL_PATCH][4]) +{ + if (exec_list_is_empty(io_list)) + return false; + + nir_foreach_variable(var, io_list) { + if (variable_can_rewrite(var)) { + unsigned loc = var->data.location - VARYING_SLOT_VAR0; + unsigned frac = var->data.location_frac; + old_vars[loc][frac] = var; + } + } + + bool merged_any_vars = false; + + /* We don't handle combining vars of different type e.g. different array + * lengths. 
+ */ + for (unsigned loc = 0; loc < MAX_VARYINGS_INCL_PATCH; loc++) { + unsigned frac = 0; + while (frac < 4) { + nir_variable *first_var = old_vars[loc][frac]; + if (!first_var) { + frac++; + continue; + } + + int first = frac; + bool found_merge = false; + + while (frac < 4) { + nir_variable *var = old_vars[loc][frac]; + if (!var) + break; + + if (var != first_var) { + if (!variables_can_merge(shader, first_var, var)) + break; + + found_merge = true; + } + + const unsigned num_components = + glsl_get_components(glsl_without_array(var->type)); + + /* We had better not have any overlapping vars */ + for (unsigned i = 1; i < num_components; i++) + assert(old_vars[loc][frac + i] == NULL); + + frac += num_components; + } + + if (!found_merge) + continue; + + merged_any_vars = true; + + nir_variable *var = nir_variable_clone(old_vars[loc][first], shader); + var->data.location_frac = first; + var->type = resize_array_vec_type(var->type, frac - first); + + nir_shader_add_variable(shader, var); + for (unsigned i = first; i < frac; i++) + new_vars[loc][i] = var; + } + } + + return merged_any_vars; +} + +static nir_deref_instr * +build_array_deref_of_new_var(nir_builder *b, nir_variable *new_var, + nir_deref_instr *leader) +{ + if (leader->deref_type == nir_deref_type_var) + return nir_build_deref_var(b, new_var); + + nir_deref_instr *parent = + build_array_deref_of_new_var(b, new_var, nir_deref_instr_parent(leader)); + + return nir_build_deref_follower(b, parent, leader); +} + +static bool +nir_lower_io_to_vector_impl(nir_function_impl *impl, nir_variable_mode modes) +{ + assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out))); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_metadata_require(impl, nir_metadata_dominance); + + nir_shader *shader = impl->function->shader; + nir_variable *old_inputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + nir_variable *new_inputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + nir_variable *old_outputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + 
nir_variable *new_outputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + + if (modes & nir_var_shader_in) { + /* Vertex shaders support overlapping inputs. We don't do those */ + assert(b.shader->info.stage != MESA_SHADER_VERTEX); + + /* If we don't actually merge any variables, remove that bit from modes + * so we don't bother doing extra non-work. + */ + if (!create_new_io_vars(shader, &shader->inputs, + old_inputs, new_inputs)) + modes &= ~nir_var_shader_in; + } + + if (modes & nir_var_shader_out) { + /* Fragment shader outputs are always vec4. You shouldn't have + * scalarized them and it doesn't make sense to vectorize them. + */ + assert(b.shader->info.stage != MESA_SHADER_FRAGMENT); + + /* If we don't actually merge any variables, remove that bit from modes + * so we don't bother doing extra non-work. + */ + if (!create_new_io_vars(shader, &shader->outputs, + old_outputs, new_outputs)) + modes &= ~nir_var_shader_out; + } + + if (!modes) + return false; + + bool progress = false; + + /* Actually lower all the IO load/store intrinsics. Load instructions are + * lowered to a vector load and an ALU instruction to grab the channels we + * want. Outputs are lowered to a write-masked store of the vector output. + * For non-TCS outputs, we then run nir_lower_io_to_temporaries at the end + * to clean up the partial writes. 
+ */ + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_interp_deref_at_centroid: + case nir_intrinsic_interp_deref_at_sample: + case nir_intrinsic_interp_deref_at_offset: { + nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]); + if (!(old_deref->mode & modes)) + break; + + if (old_deref->mode == nir_var_shader_out) + assert(b.shader->info.stage == MESA_SHADER_TESS_CTRL); + + nir_variable *old_var = nir_deref_instr_get_variable(old_deref); + if (old_var->data.location < VARYING_SLOT_VAR0) + break; + + const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0; + const unsigned old_frac = old_var->data.location_frac; + nir_variable *new_var = old_deref->mode == nir_var_shader_in ? + new_inputs[loc][old_frac] : + new_outputs[loc][old_frac]; + if (!new_var) + break; + + assert(new_var->data.location == VARYING_SLOT_VAR0 + loc); + const unsigned new_frac = new_var->data.location_frac; + + nir_component_mask_t vec4_comp_mask = + ((1 << intrin->num_components) - 1) << old_frac; + + b.cursor = nir_before_instr(&intrin->instr); + + /* Rewrite the load to use the new variable and only select a + * portion of the result. 
+ */ + nir_deref_instr *new_deref = + build_array_deref_of_new_var(&b, new_var, old_deref); + assert(glsl_type_is_vector(new_deref->type)); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(&new_deref->dest.ssa)); + + intrin->num_components = + glsl_get_components(new_deref->type); + intrin->dest.ssa.num_components = intrin->num_components; + + b.cursor = nir_after_instr(&intrin->instr); + + nir_ssa_def *new_vec = nir_channels(&b, &intrin->dest.ssa, + vec4_comp_mask >> new_frac); + nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, + nir_src_for_ssa(new_vec), + new_vec->parent_instr); + + progress = true; + break; + } + + case nir_intrinsic_store_deref: { + nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]); + if (old_deref->mode != nir_var_shader_out) + break; + + nir_variable *old_var = nir_deref_instr_get_variable(old_deref); + if (old_var->data.location < VARYING_SLOT_VAR0) + break; + + const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0; + const unsigned old_frac = old_var->data.location_frac; + nir_variable *new_var = new_outputs[loc][old_frac]; + if (!new_var) + break; + + assert(new_var->data.location == VARYING_SLOT_VAR0 + loc); + const unsigned new_frac = new_var->data.location_frac; + + b.cursor = nir_before_instr(&intrin->instr); + + /* Rewrite the store to be a masked store to the new variable */ + nir_deref_instr *new_deref = + build_array_deref_of_new_var(&b, new_var, old_deref); + assert(glsl_type_is_vector(new_deref->type)); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(&new_deref->dest.ssa)); + + intrin->num_components = + glsl_get_components(new_deref->type); + + nir_component_mask_t old_wrmask = nir_intrinsic_write_mask(intrin); + + assert(intrin->src[1].is_ssa); + nir_ssa_def *old_value = intrin->src[1].ssa; + nir_ssa_def *comps[4]; + for (unsigned c = 0; c < intrin->num_components; c++) { + if (new_frac + c >= old_frac && + (old_wrmask & 1 << (new_frac + c - 
old_frac))) { + comps[c] = nir_channel(&b, old_value, + new_frac + c - old_frac); + } else { + comps[c] = nir_ssa_undef(&b, old_value->num_components, + old_value->bit_size); + } + } + nir_ssa_def *new_value = nir_vec(&b, comps, intrin->num_components); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[1], + nir_src_for_ssa(new_value)); + + nir_intrinsic_set_write_mask(intrin, + old_wrmask << (old_frac - new_frac)); + + progress = true; + break; + } + + default: + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + + return progress; +} + +bool +nir_lower_io_to_vector(nir_shader *shader, nir_variable_mode modes) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) + progress |= nir_lower_io_to_vector_impl(function->impl, modes); + } + + return progress; +} diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c index a618b86b34c..11afffe3dee 100644 --- a/src/compiler/nir/nir_lower_tex.c +++ b/src/compiler/nir/nir_lower_tex.c @@ -306,7 +306,8 @@ lower_implicit_lod(nir_builder *b, nir_tex_instr *tex) } static nir_ssa_def * -sample_plane(nir_builder *b, nir_tex_instr *tex, int plane) +sample_plane(nir_builder *b, nir_tex_instr *tex, int plane, + const nir_lower_tex_options *options) { assert(tex->dest.is_ssa); assert(nir_tex_instr_dest_size(tex) == 4); @@ -334,6 +335,11 @@ sample_plane(nir_builder *b, nir_tex_instr *tex, int plane) nir_builder_instr_insert(b, &plane_tex->instr); + /* If scaling_factor is set, return a scaled value. 
*/ + if (options->scale_factors[tex->texture_index]) + return nir_fmul_imm(b, &plane_tex->dest.ssa, + options->scale_factors[tex->texture_index]); + return &plane_tex->dest.ssa; } @@ -366,12 +372,13 @@ convert_yuv_to_rgb(nir_builder *b, nir_tex_instr *tex, } static void -lower_y_uv_external(nir_builder *b, nir_tex_instr *tex) +lower_y_uv_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *uv = sample_plane(b, tex, 1); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *uv = sample_plane(b, tex, 1, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 0), @@ -381,13 +388,14 @@ lower_y_uv_external(nir_builder *b, nir_tex_instr *tex) } static void -lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex) +lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *u = sample_plane(b, tex, 1); - nir_ssa_def *v = sample_plane(b, tex, 2); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *u = sample_plane(b, tex, 1, options); + nir_ssa_def *v = sample_plane(b, tex, 2, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 0), @@ -397,12 +405,13 @@ lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex) } static void -lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex) +lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *xuxv = sample_plane(b, tex, 1); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *xuxv = sample_plane(b, tex, 1, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 0), @@ -412,12 +421,13 @@ lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex) } static void 
-lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex) +lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *uxvx = sample_plane(b, tex, 1); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *uxvx = sample_plane(b, tex, 1, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 1), @@ -427,11 +437,12 @@ lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex) } static void -lower_ayuv_external(nir_builder *b, nir_tex_instr *tex) +lower_ayuv_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *ayuv = sample_plane(b, tex, 0); + nir_ssa_def *ayuv = sample_plane(b, tex, 0, options); convert_yuv_to_rgb(b, tex, nir_channel(b, ayuv, 2), @@ -879,6 +890,25 @@ lower_tex_packing(nir_builder *b, nir_tex_instr *tex, color->parent_instr); } +static bool +sampler_index_lt(nir_tex_instr *tex, unsigned max) +{ + assert(nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref) == -1); + + unsigned sampler_index = tex->sampler_index; + + int sampler_offset_idx = + nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset); + if (sampler_offset_idx >= 0) { + if (!nir_src_is_const(tex->src[sampler_offset_idx].src)) + return false; + + sampler_index += nir_src_as_uint(tex->src[sampler_offset_idx].src); + } + + return sampler_index < max; +} + static bool nir_lower_tex_block(nir_block *block, nir_builder *b, const nir_lower_tex_options *options) @@ -923,27 +953,27 @@ nir_lower_tex_block(nir_block *block, nir_builder *b, } if ((1 << tex->texture_index) & options->lower_y_uv_external) { - lower_y_uv_external(b, tex); + lower_y_uv_external(b, tex, options); progress = true; } if ((1 << tex->texture_index) & options->lower_y_u_v_external) { - lower_y_u_v_external(b, tex); + lower_y_u_v_external(b, tex, options); progress = true; } if ((1 
<< tex->texture_index) & options->lower_yx_xuxv_external) { - lower_yx_xuxv_external(b, tex); + lower_yx_xuxv_external(b, tex, options); progress = true; } if ((1 << tex->texture_index) & options->lower_xy_uxvx_external) { - lower_xy_uxvx_external(b, tex); + lower_xy_uxvx_external(b, tex, options); progress = true; } if ((1 << tex->texture_index) & options->lower_ayuv_external) { - lower_ayuv_external(b, tex); + lower_ayuv_external(b, tex, options); progress = true; } @@ -995,6 +1025,8 @@ nir_lower_tex_block(nir_block *block, nir_builder *b, (options->lower_txd_shadow && tex->is_shadow) || (options->lower_txd_shadow_clamp && tex->is_shadow && has_min_lod) || (options->lower_txd_offset_clamp && has_offset && has_min_lod) || + (options->lower_txd_clamp_if_sampler_index_not_lt_16 && + has_min_lod && !sampler_index_lt(tex, 16)) || (options->lower_txd_cube_map && tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) || (options->lower_txd_3d && diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index d32005846a6..f52e623ef0f 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -404,12 +404,21 @@ def unpack_4x8(fmt): float absX = fabs(src0.x); float absY = fabs(src0.y); float absZ = fabs(src0.z); -if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; } -if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; } -if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; } -if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; } -if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; } -if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; } + +float ma = 0.0; +if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; } +if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; } +if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; } + +if (src0.x >= 0 && absX >= 
absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; } +if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; } +if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; } +if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; } +if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; } +if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; } + +dst.x = dst.x / ma + 0.5; +dst.y = dst.y / ma + 0.5; """) unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """ diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 75a3d2ad238..53c842b9ef9 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -618,11 +618,11 @@ # Reassociate constants in add/mul chains so they can be folded together. # For now, we mostly only handle cases where the constants are separated by # a single non-constant. We could do better eventually. - (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)), - (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)), - (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)), - (('~fadd', '#a', ('fneg', ('fadd', b, '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), - (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)), + (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), + (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), + (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), + (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), + (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), # By definition... 
(('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), @@ -929,9 +929,6 @@ def bitfield_reverse(u): (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'), (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), - (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)), - (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)), - # we do these late so that we don't get in the way of creating ffmas (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c b/src/compiler/nir/nir_opt_copy_prop_vars.c index 392fef407cf..a71cce19a80 100644 --- a/src/compiler/nir/nir_opt_copy_prop_vars.c +++ b/src/compiler/nir/nir_opt_copy_prop_vars.c @@ -653,7 +653,7 @@ copy_prop_vars_block(struct copy_prop_var_state *state, struct copy_entry *src_entry = lookup_entry_for_deref(copies, src, nir_derefs_a_contains_b_bit); - struct value value; + struct value value = {0}; if (try_load_from_entry(state, src_entry, b, intrin, src, &value)) { if (value.is_ssa) { /* lookup_load has already ensured that we get a single SSA diff --git a/src/compiler/nir/nir_opt_idiv_const.c b/src/compiler/nir/nir_opt_idiv_const.c index 7fa739161ba..3e4b7a42d42 100644 --- a/src/compiler/nir/nir_opt_idiv_const.c +++ b/src/compiler/nir/nir_opt_idiv_const.c @@ -65,15 +65,17 @@ build_umod(nir_builder *b, nir_ssa_def *n, uint64_t d) static nir_ssa_def * build_idiv(nir_builder *b, nir_ssa_def *n, int64_t d) { + uint64_t abs_d = d < 0 ? -d : d; + if (d == 0) { return nir_imm_intN_t(b, 0, n->bit_size); } else if (d == 1) { return n; } else if (d == -1) { return nir_ineg(b, n); - } else if (util_is_power_of_two_or_zero64(d)) { - uint64_t abs_d = d < 0 ? 
-d : d; - nir_ssa_def *uq = nir_ishr(b, n, nir_imm_int(b, util_logbase2_64(abs_d))); + } else if (util_is_power_of_two_or_zero64(abs_d)) { + nir_ssa_def *uq = nir_ushr(b, nir_iabs(b, n), + nir_imm_int(b, util_logbase2_64(abs_d))); nir_ssa_def *n_neg = nir_ilt(b, n, nir_imm_intN_t(b, 0, n->bit_size)); nir_ssa_def *neg = d < 0 ? nir_inot(b, n_neg) : n_neg; return nir_bcsel(b, neg, nir_ineg(b, uq), uq); diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c index c2f945d4d59..ba94807bb20 100644 --- a/src/compiler/nir/nir_opt_if.c +++ b/src/compiler/nir/nir_opt_if.c @@ -313,6 +313,13 @@ opt_if_loop_last_continue(nir_loop *loop) if (!then_ends_in_continue && !else_ends_in_continue) return false; + /* if the block after the if/else is empty we bail, otherwise we might end + * up looping forever + */ + if (&nif->cf_node == nir_cf_node_prev(&last_block->cf_node) && + exec_list_is_empty(&last_block->instr_list)) + return false; + /* Move the last block of the loop inside the last if-statement */ nir_cf_list tmp; nir_cf_extract(&tmp, nir_after_cf_node(if_node), diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index 1deb02a380e..32d337f99dd 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -59,8 +59,7 @@ static bool block_check_for_allowed_instrs(nir_block *block, unsigned *count, - bool alu_ok, bool indirect_load_ok, - bool expensive_alu_ok) + bool alu_ok, bool indirect_load_ok) { nir_foreach_instr(instr, block) { switch (instr->type) { @@ -118,25 +117,6 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, case nir_op_vec3: case nir_op_vec4: break; - - case nir_op_fcos: - case nir_op_fdiv: - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_fmod: - case nir_op_fpow: - case nir_op_frcp: - case nir_op_frem: - case nir_op_frsq: - case nir_op_fsin: - case nir_op_idiv: - case nir_op_irem: - case nir_op_udiv: - if (!alu_ok || 
!expensive_alu_ok) - return false; - - break; - default: if (!alu_ok) { /* It must be a move-like operation. */ @@ -180,8 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, static bool nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, - unsigned limit, bool indirect_load_ok, - bool expensive_alu_ok) + unsigned limit, bool indirect_load_ok) { if (nir_cf_node_is_first(&block->cf_node)) return false; @@ -202,9 +181,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, /* ... and those blocks must only contain "allowed" instructions. */ unsigned count = 0; if (!block_check_for_allowed_instrs(then_block, &count, limit != 0, - indirect_load_ok, expensive_alu_ok) || + indirect_load_ok) || !block_check_for_allowed_instrs(else_block, &count, limit != 0, - indirect_load_ok, expensive_alu_ok)) + indirect_load_ok)) return false; if (count > limit) @@ -271,15 +250,14 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, static bool nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok) + bool indirect_load_ok) { nir_shader *shader = impl->function->shader; bool progress = false; nir_foreach_block_safe(block, impl) { progress |= nir_opt_peephole_select_block(block, shader, limit, - indirect_load_ok, - expensive_alu_ok); + indirect_load_ok); } if (progress) { @@ -295,15 +273,14 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok) + bool indirect_load_ok) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) progress |= nir_opt_peephole_select_impl(function->impl, limit, - indirect_load_ok, - expensive_alu_ok); + indirect_load_ok); } return progress; diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 80bc25fde9a..422249677b7 100644 --- 
a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -812,8 +812,8 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) assert(dim < ARRAY_SIZE(dim_name) && dim_name[dim]); fprintf(fp, " image_dim=%s", dim_name[dim]); } else if (idx == NIR_INTRINSIC_IMAGE_ARRAY) { - bool array = nir_intrinsic_image_dim(instr); - fprintf(fp, " image_dim=%s", array ? "true" : "false"); + bool array = nir_intrinsic_image_array(instr); + fprintf(fp, " image_array=%s", array ? "true" : "false"); } else if (idx == NIR_INTRINSIC_DESC_TYPE) { VkDescriptorType desc_type = nir_intrinsic_desc_type(instr); fprintf(fp, " desc_type=%s", vulkan_descriptor_type_name(desc_type)); diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c index b4d22d91c27..f182818374d 100644 --- a/src/compiler/nir/nir_repair_ssa.c +++ b/src/compiler/nir/nir_repair_ssa.c @@ -77,6 +77,15 @@ repair_ssa_def(nir_ssa_def *def, void *void_state) } } + nir_foreach_if_use(src, def) { + nir_block *block_before_if = + nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node)); + if (!nir_block_dominates(def->parent_instr->block, block_before_if)) { + is_valid = false; + break; + } + } + if (is_valid) return true; @@ -98,6 +107,15 @@ repair_ssa_def(nir_ssa_def *def, void *void_state) } } + nir_foreach_if_use_safe(src, def) { + nir_block *block_before_if = + nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node)); + if (!nir_block_dominates(def->parent_instr->block, block_before_if)) { + nir_if_rewrite_condition(src->parent_if, nir_src_for_ssa( + nir_phi_builder_value_get_block_def(val, block_before_if))); + } + } + return true; } diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h index 89f1cba5c52..1fb450752ad 100644 --- a/src/compiler/nir/nir_search_helpers.h +++ b/src/compiler/nir/nir_search_helpers.h @@ -116,22 +116,6 @@ is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, return 
!nir_src_is_const(instr->src[src].src); } -static inline bool -is_used_more_than_once(nir_alu_instr *instr) -{ - bool zero_if_use = list_empty(&instr->dest.dest.ssa.if_uses); - bool zero_use = list_empty(&instr->dest.dest.ssa.uses); - - if (zero_use && zero_if_use) - return false; - else if (zero_use && list_is_singular(&instr->dest.dest.ssa.if_uses)) - return false; - else if (zero_if_use && list_is_singular(&instr->dest.dest.ssa.uses)) - return false; - - return true; -} - static inline bool is_used_once(nir_alu_instr *instr) { diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp index b4bde5470c0..3a406e99769 100644 --- a/src/compiler/nir_types.cpp +++ b/src/compiler/nir_types.cpp @@ -326,6 +326,12 @@ glsl_type_is_integer(const struct glsl_type *type) return type->is_integer(); } +bool +glsl_type_contains_64bit(const struct glsl_type *type) +{ + return type->contains_64bit(); +} + const glsl_type * glsl_void_type(void) { diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h index 40cddf76374..eb5cdf0a089 100644 --- a/src/compiler/nir_types.h +++ b/src/compiler/nir_types.h @@ -97,37 +97,7 @@ unsigned glsl_atomic_size(const struct glsl_type *type); static inline unsigned glsl_get_bit_size(const struct glsl_type *type) { - switch (glsl_get_base_type(type)) { - case GLSL_TYPE_BOOL: - return 1; - - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_FLOAT: /* TODO handle mediump */ - case GLSL_TYPE_SUBROUTINE: - return 32; - - case GLSL_TYPE_FLOAT16: - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - return 16; - - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - return 8; - - case GLSL_TYPE_DOUBLE: - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - case GLSL_TYPE_IMAGE: - case GLSL_TYPE_SAMPLER: - return 64; - - default: - unreachable("unknown base type"); - } - - return 0; + return glsl_base_type_get_bit_size(glsl_get_base_type(type)); } bool glsl_type_is_16bit(const struct glsl_type *type); @@ -149,6 +119,7 @@ bool 
glsl_type_is_dual_slot(const struct glsl_type *type); bool glsl_type_is_numeric(const struct glsl_type *type); bool glsl_type_is_boolean(const struct glsl_type *type); bool glsl_type_is_integer(const struct glsl_type *type); +bool glsl_type_contains_64bit(const struct glsl_type *type); bool glsl_sampler_type_is_shadow(const struct glsl_type *type); bool glsl_sampler_type_is_array(const struct glsl_type *type); bool glsl_contains_atomic(const struct glsl_type *type); diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h index c3dbe764961..e82f465b256 100644 --- a/src/compiler/shader_info.h +++ b/src/compiler/shader_info.h @@ -36,6 +36,8 @@ struct spirv_supported_capabilities { bool address; bool atomic_storage; bool descriptor_array_dynamic_indexing; + bool descriptor_array_non_uniform_indexing; + bool descriptor_indexing; bool device_group; bool draw_parameters; bool float64; diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 9bfe5805919..f76cac88f18 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -494,6 +494,7 @@ vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode, break; case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpMemberDecorate: case SpvOpDecorateStringGOOGLE: case SpvOpMemberDecorateStringGOOGLE: @@ -503,6 +504,7 @@ vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode, struct vtn_decoration *dec = rzalloc(b, struct vtn_decoration); switch (opcode) { case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpDecorateStringGOOGLE: dec->scope = VTN_DEC_DECORATION; break; @@ -2155,6 +2157,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, case nir_texop_txl: case nir_texop_txd: case nir_texop_tg4: + case nir_texop_lod: /* These operations require a sampler */ p->src = nir_src_for_ssa(&sampler->dest.ssa); p->src_type = nir_tex_src_sampler_deref; @@ -2163,7 +2166,6 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, case nir_texop_txf: case 
nir_texop_txf_ms: case nir_texop_txs: - case nir_texop_lod: case nir_texop_query_levels: case nir_texop_texture_samples: case nir_texop_samples_identical: @@ -3045,12 +3047,7 @@ nir_ssa_def * vtn_vector_extract_dynamic(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *index) { - nir_ssa_def *dest = vtn_vector_extract(b, src, 0); - for (unsigned i = 1; i < src->num_components; i++) - dest = nir_bcsel(&b->nb, nir_ieq_imm(&b->nb, index, i), - vtn_vector_extract(b, src, i), dest); - - return dest; + return nir_vector_extract(&b->nb, src, nir_i2i(&b->nb, index, 32)); } nir_ssa_def * @@ -3595,6 +3592,7 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityInt64Atomics: spv_check_supported(int64_atomics, cap); + break; case SpvCapabilityInt8: spv_check_supported(int8, cap); @@ -3703,12 +3701,26 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, spv_check_supported(storage_8bit, cap); break; + case SpvCapabilityShaderNonUniformEXT: + spv_check_supported(descriptor_indexing, cap); + break; + case SpvCapabilityInputAttachmentArrayDynamicIndexingEXT: case SpvCapabilityUniformTexelBufferArrayDynamicIndexingEXT: case SpvCapabilityStorageTexelBufferArrayDynamicIndexingEXT: spv_check_supported(descriptor_array_dynamic_indexing, cap); break; + case SpvCapabilityUniformBufferArrayNonUniformIndexingEXT: + case SpvCapabilitySampledImageArrayNonUniformIndexingEXT: + case SpvCapabilityStorageBufferArrayNonUniformIndexingEXT: + case SpvCapabilityStorageImageArrayNonUniformIndexingEXT: + case SpvCapabilityInputAttachmentArrayNonUniformIndexingEXT: + case SpvCapabilityUniformTexelBufferArrayNonUniformIndexingEXT: + case SpvCapabilityStorageTexelBufferArrayNonUniformIndexingEXT: + spv_check_supported(descriptor_array_non_uniform_indexing, cap); + break; + case SpvCapabilityRuntimeDescriptorArrayEXT: spv_check_supported(runtime_descriptor_array, cap); break; @@ -3764,6 +3776,7 @@ vtn_handle_preamble_instruction(struct vtn_builder 
*b, SpvOp opcode, case SpvOpExecutionMode: case SpvOpDecorationGroup: case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpMemberDecorate: case SpvOpGroupDecorate: case SpvOpGroupMemberDecorate: @@ -3951,6 +3964,7 @@ vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode, case SpvOpMemberName: case SpvOpDecorationGroup: case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpMemberDecorate: case SpvOpGroupDecorate: case SpvOpGroupMemberDecorate: @@ -4480,20 +4494,35 @@ spirv_to_nir(const uint32_t *words, size_t word_count, } } while (progress); + vtn_assert(b->entry_point->value_type == vtn_value_type_function); + nir_function *entry_point = b->entry_point->func->impl->function; + vtn_assert(entry_point); + + entry_point->is_entrypoint = true; + + /* When multiple shader stages exist in the same SPIR-V module, we + * generate input and output variables for every stage, in the same + * NIR program. These dead variables can be invalid NIR. For example, + * TCS outputs must be per-vertex arrays (or decorated 'patch'), while + * VS output variables wouldn't be. + * + * To ensure we have valid NIR, we eliminate any dead inputs and outputs + * right away. In order to do so, we must lower any constant initializers + * on outputs so nir_remove_dead_variables sees that they're written to. + */ + nir_lower_constant_initializers(b->shader, nir_var_shader_out); + nir_remove_dead_variables(b->shader, + nir_var_shader_in | nir_var_shader_out); + /* We sometimes generate bogus derefs that, while never used, give the * validator a bit of heartburn. Run dead code to get rid of them. 
*/ nir_opt_dce(b->shader); - vtn_assert(b->entry_point->value_type == vtn_value_type_function); - nir_function *entry_point = b->entry_point->func->impl->function; - vtn_assert(entry_point); - /* Unparent the shader from the vtn_builder before we delete the builder */ ralloc_steal(NULL, b->shader); ralloc_free(b); - entry_point->is_entrypoint = true; return entry_point; } diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c index ecdfd0c735f..fe5340ab8cf 100644 --- a/src/compiler/spirv/vtn_variables.c +++ b/src/compiler/spirv/vtn_variables.c @@ -1444,6 +1444,8 @@ apply_var_decoration(struct vtn_builder *b, switch (builtin) { case SpvBuiltInTessLevelOuter: case SpvBuiltInTessLevelInner: + case SpvBuiltInClipDistance: + case SpvBuiltInCullDistance: var_data->compact = true; break; case SpvBuiltInFragCoord: @@ -2442,9 +2444,17 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, case SpvOpArrayLength: { struct vtn_pointer *ptr = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; + const uint32_t field = w[4]; - const uint32_t offset = ptr->var->type->offsets[w[4]]; - const uint32_t stride = ptr->var->type->members[w[4]]->stride; + vtn_fail_if(ptr->type->base_type != vtn_base_type_struct, + "OpArrayLength must take a pointer to a structure type"); + vtn_fail_if(field != ptr->type->length - 1 || + ptr->type->members[field]->base_type != vtn_base_type_array, + "OpArrayLength must reference the last memeber of the " + "structure and that must be an array"); + + const uint32_t offset = ptr->type->offsets[field]; + const uint32_t stride = ptr->type->members[field]->stride; if (!ptr->block_index) { struct vtn_access_chain chain = { diff --git a/src/egl/Android.mk b/src/egl/Android.mk index 42b391e6d86..3c7f1366e34 100644 --- a/src/egl/Android.mk +++ b/src/egl/Android.mk @@ -59,11 +59,22 @@ LOCAL_SHARED_LIBRARIES := \ libcutils \ libsync +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_C_INCLUDES += \ + 
frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include +LOCAL_HEADER_LIBRARIES += libnativebase_headers +endif + ifeq ($(BOARD_USES_DRM_GRALLOC),true) LOCAL_CFLAGS += -DHAVE_DRM_GRALLOC LOCAL_SHARED_LIBRARIES += libgralloc_drm endif +ifeq ($(strip $(BOARD_USES_GRALLOC1)),true) +LOCAL_CFLAGS += -DHAVE_GRALLOC1 +endif + ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),) LOCAL_SHARED_LIBRARIES += libnativewindow endif @@ -81,6 +92,6 @@ endif LOCAL_MODULE := libGLES_mesa LOCAL_MODULE_RELATIVE_PATH := egl - +LOCAL_CFLAGS += -Wno-error include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c index c98b9a5d18a..ca26e34daa3 100644 --- a/src/egl/drivers/dri2/egl_dri2.c +++ b/src/egl/drivers/dri2/egl_dri2.c @@ -66,6 +66,20 @@ #include "util/u_vector.h" #include "mapi/glapi/glapi.h" +/* Additional definitions not yet in the drm_fourcc.h. + */ +#ifndef DRM_FORMAT_P010 +#define DRM_FORMAT_P010 fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cb:Cr plane 10 bits per channel */ +#endif + +#ifndef DRM_FORMAT_P012 +#define DRM_FORMAT_P012 fourcc_code('P', '0', '1', '2') /* 2x2 subsampled Cb:Cr plane 12 bits per channel */ +#endif + +#ifndef DRM_FORMAT_P016 +#define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cb:Cr plane 16 bits per channel */ +#endif + #define NUM_ATTRIBS 12 static void @@ -199,8 +213,10 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id, bind_to_texture_rgb = 0; bind_to_texture_rgba = 0; - for (int i = 0; dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib, - &value); ++i) { + for (int i = 0; i < __DRI_ATTRIB_MAX; ++i) { + if (!dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib, &value)) + break; + switch (attrib) { case __DRI_ATTRIB_RENDER_TYPE: if (value & __DRI_ATTRIB_RGBA_BIT) @@ -2262,6 +2278,9 @@ dri2_num_fourcc_format_planes(EGLint format) case DRM_FORMAT_NV21: case DRM_FORMAT_NV16: 
case DRM_FORMAT_NV61: + case DRM_FORMAT_P010: + case DRM_FORMAT_P012: + case DRM_FORMAT_P016: return 2; case DRM_FORMAT_YUV410: diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index a9ddadf11b1..4e80deb2038 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -69,6 +69,10 @@ struct zwp_linux_dmabuf_v1; #include #endif /* HAVE_ANDROID_PLATFORM */ +#ifdef HAVE_GRALLOC1 +#include +#endif + #include "eglconfig.h" #include "eglcontext.h" #include "egldevice.h" @@ -238,7 +242,14 @@ struct dri2_egl_display #endif #ifdef HAVE_ANDROID_PLATFORM - const gralloc_module_t *gralloc; + const hw_module_t *gralloc; + uint16_t gralloc_version; +#ifdef HAVE_GRALLOC1 + gralloc1_device_t *gralloc1_dvc; + GRALLOC1_PFN_LOCK_FLEX pfn_lockflex; + GRALLOC1_PFN_GET_FORMAT pfn_getFormat; + GRALLOC1_PFN_UNLOCK pfn_unlock; +#endif #endif bool is_render_node; diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 366a9ec14e9..a08723625fb 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -49,6 +49,8 @@ #define ALIGN(val, align) (((val) + (align) - 1) & ~((align) - 1)) +#define GRALLOC_DRM_GET_FORMAT 1 + struct droid_yuv_format { /* Lookup keys */ int native; /* HAL_PIXEL_FORMAT_ */ @@ -59,14 +61,26 @@ struct droid_yuv_format { int fourcc; /* __DRI_IMAGE_FOURCC_ */ }; +/* This enumeration can be deleted if Android defined it in + * system/core/include/system/graphics.h + */ +enum { + HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL = 0x100, + HAL_PIXEL_FORMAT_NV12 = 0x10F, + HAL_PIXEL_FORMAT_P010_INTEL = 0x110 +}; + /* The following table is used to look up a DRI image FourCC based * on native format and information contained in android_ycbcr struct. 
*/ static const struct droid_yuv_format droid_yuv_formats[] = { /* Native format, YCrCb, Chroma step, DRI image FourCC */ { HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, + { HAL_PIXEL_FORMAT_P010_INTEL, 0, 4, __DRI_IMAGE_FOURCC_P010 }, { HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 1, __DRI_IMAGE_FOURCC_YUV420 }, { HAL_PIXEL_FORMAT_YCbCr_420_888, 1, 1, __DRI_IMAGE_FOURCC_YVU420 }, { HAL_PIXEL_FORMAT_YV12, 1, 1, __DRI_IMAGE_FOURCC_YVU420 }, + { HAL_PIXEL_FORMAT_NV12, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, + { HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, /* HACK: See droid_create_image_from_prime_fd() and * https://issuetracker.google.com/32077885. */ { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, @@ -249,6 +263,51 @@ droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf) return EGL_TRUE; } +static int +droid_resolve_format(struct dri2_egl_display *dri2_dpy, + struct ANativeWindowBuffer *buf) +{ + int format = -1; + int ret; + + if (buf->format != HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) + return buf->format; +#ifdef HAVE_GRALLOC1 + if(dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + + if (!dri2_dpy->pfn_getFormat) { + _eglLog(_EGL_WARNING, "Gralloc does not support getFormat"); + return -1; + } + ret = dri2_dpy->pfn_getFormat(dri2_dpy->gralloc1_dvc, buf->handle, + &format); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->getFormat failed: %d", ret); + return -1; + } + } else { +#else + const gralloc_module_t *gralloc0; + gralloc0 = dri2_dpy->gralloc; + + if (!gralloc0->perform) { + _eglLog(_EGL_WARNING, "gralloc->perform not supported"); + return -1; + } + ret = gralloc0->perform(dri2_dpy->gralloc, + GRALLOC_DRM_GET_FORMAT, + buf->handle, &format); + if (ret){ + _eglLog(_EGL_WARNING, "gralloc->perform failed with error: %d", ret); + return -1; + } +#endif +#ifdef HAVE_GRALLOC1 + } +#endif + return format; +} + static EGLBoolean droid_window_enqueue_buffer(_EGLDisplay *disp, struct 
dri2_egl_surface *dri2_surf) { @@ -463,7 +522,7 @@ droid_swap_interval(_EGLDriver *drv, _EGLDisplay *dpy, struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf); struct ANativeWindow *window = dri2_surf->window; - if (window->setSwapInterval(window, interval)) + if (window && window->setSwapInterval(window, interval)) return EGL_FALSE; surf->SwapInterval = interval; @@ -664,11 +723,18 @@ droid_query_buffer_age(_EGLDriver *drv, { struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surface); + /* To avoid blocking other EGL calls, release the display mutex before + * we enter droid_window_dequeue_buffer() and re-acquire the mutex upon + * return. + */ + mtx_unlock(&disp->Mutex); if (update_buffers(dri2_surf) < 0) { _eglError(EGL_BAD_ALLOC, "droid_query_buffer_age"); + mtx_lock(&disp->Mutex); return -1; } + mtx_lock(&disp->Mutex); return dri2_surf->back ? dri2_surf->back->age : 0; } @@ -731,6 +797,31 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw) return EGL_TRUE; } +static int get_ycbcr_from_flexlayout(struct android_flex_layout *outFlexLayout, struct android_ycbcr *ycbcr) +{ + + for( int i = 0; i < outFlexLayout->num_planes; i++) { + switch(outFlexLayout->planes[i].component){ + case FLEX_COMPONENT_Y: + ycbcr->y = outFlexLayout->planes[i].top_left; + ycbcr->ystride = outFlexLayout->planes[i].v_increment; + break; + case FLEX_COMPONENT_Cb: + ycbcr->cb = outFlexLayout->planes[i].top_left; + ycbcr->cstride = outFlexLayout->planes[i].v_increment; + break; + case FLEX_COMPONENT_Cr: + ycbcr->cr = outFlexLayout->planes[i].top_left; + ycbcr->chroma_step = outFlexLayout->planes[i].h_increment; + break; + default: + _eglLog(_EGL_WARNING,"unknown component 0x%x", __func__, outFlexLayout->planes[i].component); + break; + } + } + return 0; +} + #if ANDROID_API_LEVEL >= 23 static EGLBoolean droid_set_damage_region(_EGLDriver *drv, @@ -774,30 +865,70 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx, { struct 
dri2_egl_display *dri2_dpy = dri2_egl_display(disp); struct android_ycbcr ycbcr; +#ifdef HAVE_GRALLOC1 + struct android_flex_layout outFlexLayout; + gralloc1_rect_t accessRegion; +#endif size_t offsets[3]; size_t pitches[3]; int is_ycrcb; int fourcc; int ret; - if (!dri2_dpy->gralloc->lock_ycbcr) { - _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr"); + int format = droid_resolve_format(dri2_dpy, buf); + if (format < 0) { + _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; } memset(&ycbcr, 0, sizeof(ycbcr)); - ret = dri2_dpy->gralloc->lock_ycbcr(dri2_dpy->gralloc, buf->handle, - 0, 0, 0, 0, 0, &ycbcr); - if (ret) { - /* HACK: See droid_create_image_from_prime_fd() and - * https://issuetracker.google.com/32077885.*/ - if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) - return NULL; - - _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret); - return NULL; - } - dri2_dpy->gralloc->unlock(dri2_dpy->gralloc, buf->handle); +#ifdef HAVE_GRALLOC1 + if(dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + if (!dri2_dpy->pfn_lockflex) { + _eglLog(_EGL_WARNING, "Gralloc does not support lockflex"); + return NULL; + } + + ret = dri2_dpy->pfn_lockflex(dri2_dpy->gralloc1_dvc, buf->handle, + 0, 0, &accessRegion, &outFlexLayout, -1); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret); + return NULL; + } + ret = get_ycbcr_from_flexlayout(&outFlexLayout, &ycbcr); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret); + return NULL; + } + int outReleaseFence = 0; + dri2_dpy->pfn_unlock(dri2_dpy->gralloc1_dvc, buf->handle, &outReleaseFence); + } else { +#endif + const gralloc_module_t *gralloc0; + gralloc0 = dri2_dpy->gralloc; + + if (!gralloc0->lock_ycbcr) { + _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr"); + return NULL; + } + + ret = gralloc0->lock_ycbcr(gralloc0, buf->handle, + 0, 0, 0, 0, 0, &ycbcr); + + if (ret) { + /* HACK: See droid_create_image_from_prime_fd() 
and + * https://issuetracker.google.com/32077885.*/ + if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) + return NULL; + + _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret); + return NULL; + } + + gralloc0->unlock(dri2_dpy->gralloc, buf->handle); +#ifdef HAVE_GRALLOC1 + } +#endif /* When lock_ycbcr's usage argument contains no SW_READ/WRITE flags * it will return the .y/.cb/.cr pointers based on a NULL pointer, @@ -822,14 +953,15 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx, /* .chroma_step is the byte distance between the same chroma channel * values of subsequent pixels, assumed to be the same for Cb and Cr. */ - fourcc = get_fourcc_yuv(buf->format, is_ycrcb, ycbcr.chroma_step); + fourcc = get_fourcc_yuv(format, is_ycrcb, ycbcr.chroma_step); if (fourcc == -1) { _eglLog(_EGL_WARNING, "unsupported YUV format, native = %x, is_ycrcb = %d, chroma_step = %d", - buf->format, is_ycrcb, ycbcr.chroma_step); + format, is_ycrcb, ycbcr.chroma_step); return NULL; } - if (ycbcr.chroma_step == 2) { + /* FIXME? we should not rely on chroma_step */ + if (ycbcr.chroma_step == 2 || ycbcr.chroma_step == 4) { /* Semi-planar Y + CbCr or Y + CrCb format. 
*/ const EGLint attr_list_2plane[] = { EGL_WIDTH, buf->width, @@ -871,9 +1003,16 @@ static _EGLImage * droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, struct ANativeWindowBuffer *buf, int fd) { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); unsigned int pitch; - if (is_yuv(buf->format)) { + int format = droid_resolve_format(dri2_dpy, buf); + if (format < 0) { + _eglLog(_EGL_WARNING, "Could not resolve buffer format"); + return NULL; + } + + if (is_yuv(format)) { _EGLImage *image; image = droid_create_image_from_prime_fd_yuv(disp, ctx, buf, fd); @@ -888,13 +1027,13 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, return image; } - const int fourcc = get_fourcc(buf->format); + const int fourcc = get_fourcc(format); if (fourcc == -1) { _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; } - pitch = buf->stride * get_format_bpp(buf->format); + pitch = buf->stride * get_format_bpp(format); if (pitch == 0) { _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; @@ -1530,6 +1669,7 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp) _EGLDevice *dev; struct dri2_egl_display *dri2_dpy; const char *err; + hw_device_t *device; int ret; /* Not supported yet */ @@ -1547,6 +1687,27 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp) err = "DRI2: failed to get gralloc module"; goto cleanup; } + dri2_dpy->gralloc_version = dri2_dpy->gralloc->module_api_version; +#ifdef HAVE_GRALLOC1 + if (dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + ret = dri2_dpy->gralloc->methods->open(dri2_dpy->gralloc, GRALLOC_HARDWARE_MODULE_ID, &device); + if (ret) { + err = "Failed to open hw_device device"; + goto cleanup; + } else { + dri2_dpy->gralloc1_dvc = (gralloc1_device_t *)device; + + dri2_dpy->pfn_lockflex = (GRALLOC1_PFN_LOCK_FLEX)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_LOCK_FLEX); + + dri2_dpy->pfn_getFormat = 
(GRALLOC1_PFN_GET_FORMAT)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_GET_FORMAT); + + dri2_dpy->pfn_unlock = (GRALLOC1_PFN_UNLOCK)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_UNLOCK); + } + } +#endif disp->DriverData = (void *) dri2_dpy; diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index c3ca1b6f7bc..3025e34ba63 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -637,10 +637,8 @@ update_buffers(struct dri2_egl_surface *dri2_surf) struct dri2_egl_display *dri2_dpy = dri2_egl_display(dri2_surf->base.Resource.Display); - if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width || - dri2_surf->base.Height != dri2_surf->wl_win->attached_height) { - - dri2_wl_release_buffers(dri2_surf); + if (dri2_surf->base.Width != dri2_surf->wl_win->width || + dri2_surf->base.Height != dri2_surf->wl_win->height) { dri2_surf->base.Width = dri2_surf->wl_win->width; dri2_surf->base.Height = dri2_surf->wl_win->height; @@ -648,6 +646,11 @@ update_buffers(struct dri2_egl_surface *dri2_surf) dri2_surf->dy = dri2_surf->wl_win->dy; } + if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width || + dri2_surf->base.Height != dri2_surf->wl_win->attached_height) { + dri2_wl_release_buffers(dri2_surf); + } + if (get_back_bo(dri2_surf) < 0) { _eglError(EGL_BAD_ALLOC, "failed to allocate color buffer"); return -1; diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c index 6b241a524ec..318a85a4f88 100644 --- a/src/egl/main/eglcontext.c +++ b/src/egl/main/eglcontext.c @@ -178,9 +178,12 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy, * is supported for OpenGL contexts, and requesting a * forward-compatible context for OpenGL versions less than 3.0 * will generate an error." 
+ * + * Note: since the forward-compatible flag can be set more than one way, + * the OpenGL version check is performed once, below. */ if ((val & EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR) && - (api != EGL_OPENGL_API || ctx->ClientMajorVersion < 3)) { + api != EGL_OPENGL_API) { err = EGL_BAD_ATTRIBUTE; break; } diff --git a/src/egl/main/eglcurrent.c b/src/egl/main/eglcurrent.c index 479f231fb8f..d20ec64e654 100644 --- a/src/egl/main/eglcurrent.c +++ b/src/egl/main/eglcurrent.c @@ -137,13 +137,37 @@ _eglDestroyThreadInfo(_EGLThreadInfo *t) } +/** + * Delete/free a _EGLThreadInfo object. + */ +static void +_eglDestroyThreadInfoCallback(_EGLThreadInfo *t) +{ + /* If this callback is called on thread termination then try to also give a + * chance to cleanup to the client drivers. If called for module termination + * then just release the thread information as calling eglReleaseThread + * would result in a deadlock. + */ + if (_egl_TSDInitialized) { + /* The callback handler has replaced the TLS entry, which is passed in as + * 't', with NULL. Restore it here so that the release thread finds it in + * the TLS entry. + */ + _eglSetTSD(t); + eglReleaseThread(); + } else { + _eglDestroyThreadInfo(t); + } +} + + /** * Make sure TSD is initialized and return current value. */ static inline _EGLThreadInfo * _eglCheckedGetTSD(void) { - if (_eglInitTSD(&_eglDestroyThreadInfo) != EGL_TRUE) { + if (_eglInitTSD(&_eglDestroyThreadInfoCallback) != EGL_TRUE) { _eglLog(_EGL_FATAL, "failed to initialize \"current\" system"); return NULL; } diff --git a/src/egl/main/egldevice.c b/src/egl/main/egldevice.c index 4878039be0e..c5c9a21273a 100644 --- a/src/egl/main/egldevice.c +++ b/src/egl/main/egldevice.c @@ -202,18 +202,6 @@ _eglDeviceSupports(_EGLDevice *dev, _EGLDeviceExtension ext) }; } -/* Ideally we'll have an extension which passes the render node, - * instead of the card one + magic. - * - * Then we can move this in _eglQueryDeviceStringEXT below. 
Until then - * keep it separate. - */ -const char * -_eglGetDRMDeviceRenderNode(_EGLDevice *dev) -{ - return dev->device->nodes[DRM_NODE_RENDER]; -} - EGLBoolean _eglQueryDeviceAttribEXT(_EGLDevice *dev, EGLint attribute, EGLAttrib *value) diff --git a/src/egl/main/egldevice.h b/src/egl/main/egldevice.h index 83a47d5eacc..883f96f8e30 100644 --- a/src/egl/main/egldevice.h +++ b/src/egl/main/egldevice.h @@ -68,9 +68,6 @@ typedef enum _egl_device_extension _EGLDeviceExtension; EGLBoolean _eglDeviceSupports(_EGLDevice *dev, _EGLDeviceExtension ext); -const char * -_eglGetDRMDeviceRenderNode(_EGLDevice *dev); - EGLBoolean _eglQueryDeviceAttribEXT(_EGLDevice *dev, EGLint attribute, EGLAttrib *value); diff --git a/src/egl/meson.build b/src/egl/meson.build index a23cc36fc2b..b7ff09e9fed 100644 --- a/src/egl/meson.build +++ b/src/egl/meson.build @@ -93,10 +93,11 @@ if with_dri2 'drivers/dri2/egl_dri2.h', 'drivers/dri2/egl_dri2_fallbacks.h', ) + link_for_egl += [libloader, libxmlconfig] + incs_for_egl += inc_loader if with_platform_x11 files_egl += files('drivers/dri2/platform_x11.c') - incs_for_egl += inc_loader if with_dri3 files_egl += files('drivers/dri2/platform_x11_dri3.c') link_for_egl += libloader_dri3_helper @@ -105,13 +106,12 @@ if with_dri2 endif if with_platform_drm files_egl += files('drivers/dri2/platform_drm.c') - link_for_egl += [libloader, libgbm, libxmlconfig] - incs_for_egl += [inc_loader, inc_gbm, include_directories('../gbm/main')] + link_for_egl += libgbm + incs_for_egl += [inc_gbm, include_directories('../gbm/main')] deps_for_egl += dep_libdrm endif if with_platform_surfaceless files_egl += files('drivers/dri2/platform_surfaceless.c') - incs_for_egl += [inc_loader] endif if with_platform_wayland deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers] @@ -127,7 +127,6 @@ if with_dri2 if with_platform_android deps_for_egl += dep_android files_egl += files('drivers/dri2/platform_android.c') - incs_for_egl += [inc_loader] endif 
elif with_platform_haiku incs_for_egl += inc_haikugl @@ -166,7 +165,7 @@ libegl = shared_library( '-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_@0@'.format(egl_native_platform.to_upper()), ], include_directories : incs_for_egl, - link_with : [link_for_egl, libloader, libxmlconfig, libglapi, libmesa_util], + link_with : [link_for_egl, libglapi, libmesa_util], link_args : [ld_args_bsymbolic, ld_args_gc_sections], dependencies : [deps_for_egl, dep_dl, dep_libdrm, dep_clock, dep_thread], install : true, diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am index 460fb87fb46..342f03d644c 100644 --- a/src/freedreno/Makefile.am +++ b/src/freedreno/Makefile.am @@ -45,6 +45,7 @@ TESTS = BUILT_SOURCES = CLEANFILES = EXTRA_DIST = \ + meson.build \ drm/meson.build \ ir3/ir3_nir_trig.py \ ir3/meson.build diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index d9fcf798b3d..68926c9553b 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_gcm, true); else if (gcm == 2) progress |= OPT(s, nir_opt_gcm, false); - progress |= OPT(s, nir_opt_peephole_select, 16, true, true); + progress |= OPT(s, nir_opt_peephole_select, 16, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/auxiliary/Android.mk b/src/gallium/auxiliary/Android.mk index 7618c6fcd93..fe976501451 100644 --- a/src/gallium/auxiliary/Android.mk +++ b/src/gallium/auxiliary/Android.mk @@ -32,8 +32,11 @@ LOCAL_SRC_FILES := \ $(C_SOURCES) \ $(NIR_SOURCES) \ $(RENDERONLY_SOURCES) \ - $(VL_STUB_SOURCES) \ - util/u_debug_stack_android.cpp + $(VL_STUB_SOURCES) + +ifeq ($(USE_LIBBACKTRACE),true) + LOCAL_SRC_FILES += util/u_debug_stack_android.cpp +endif LOCAL_C_INCLUDES := \ $(GALLIUM_TOP)/auxiliary/util \ diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c 
index 3fc096789c0..f8c69585e6a 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -950,6 +950,8 @@ draw_set_mapped_so_targets(struct draw_context *draw, { int i; + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + for (i = 0; i < num_targets; i++) draw->so.targets[i] = targets[i]; for (i = num_targets; i < PIPE_MAX_SO_BUFFERS; i++) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index fcbdd5050fe..f307c26d4f7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -556,11 +556,11 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, llvm::SmallVector MAttrs; -#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) -#if HAVE_LLVM >= 0x0400 - /* llvm-3.7+ implements sys::getHostCPUFeatures for x86, - * which allows us to enable/disable code generation based - * on the results of cpuid. +#if HAVE_LLVM >= 0x0400 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM)) + /* llvm-3.3+ implements sys::getHostCPUFeatures for Arm + * and llvm-3.7+ for x86, which allows us to enable/disable + * code generation based on the results of cpuid on these + * architectures. */ llvm::StringMap features; llvm::sys::getHostCPUFeatures(features); @@ -570,7 +570,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, ++f) { MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str()); } -#else +#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) /* * We need to unset attributes because sometimes LLVM mistakenly assumes * certain features are present given the processor name. 
@@ -625,6 +625,12 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, MAttrs.push_back("-avx512vl"); #endif #endif +#if defined(PIPE_ARCH_ARM) + if (!util_cpu_caps.has_neon) { + MAttrs.push_back("-neon"); + MAttrs.push_back("-crypto"); + MAttrs.push_back("-vfp2"); + } #endif #if defined(PIPE_ARCH_PPC) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 0f5b3d9acb7..d6af1d84471 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -1108,7 +1108,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld, * larger than the declared size but smaller than the buffer size. */ if (reg_file != TGSI_FILE_CONSTANT) { - assert(index_limit > 0); + assert(index_limit >= 0); max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, index_limit); diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index 64f2598a259..09eac4da95a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -220,7 +220,9 @@ tgsi_is_bindless_image_file(unsigned file) { return file != TGSI_FILE_IMAGE && file != TGSI_FILE_MEMORY && - file != TGSI_FILE_BUFFER; + file != TGSI_FILE_BUFFER && + file != TGSI_FILE_CONSTBUF && + file != TGSI_FILE_HW_ATOMIC; } #ifdef __cplusplus diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index 8e3bceae18d..b596c322918 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -1524,7 +1524,8 @@ tc_buffer_do_flush_region(struct threaded_context *tc, if (ttrans->staging) { struct pipe_box src_box; - u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment, + u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment + + (box->x - ttrans->b.box.x), box->width, &src_box); /* Copy the staging 
buffer into the original one. */ diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index eaf492ce8b0..b927d014179 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -487,6 +487,10 @@ The integer capabilities: * ``PIPE_CAP_DEST_SURFACE_SRGB_CONTROL``: Indicates whether the drivers supports switching the format between sRGB and linear for a surface that is used as destination in draw and blit calls. +* ``PIPE_CAP_MAX_VARYINGS``: The maximum number of fragment shader + varyings. This will generally correspond to + ``PIPE_SHADER_CAP_MAX_INPUTS`` for the fragment shader, but in some + cases may be a smaller number. .. _pipe_capf: diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c index fd320232528..35dcac1409b 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -360,6 +360,9 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 0; + case PIPE_CAP_MAX_VARYINGS: + return screen->specs.max_varyings; + case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index fe409fa5f52..dbc15f40389 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ b/src/gallium/drivers/freedreno/Makefile.am @@ -23,4 +23,6 @@ libfreedreno_la_SOURCES = \ $(a6xx_SOURCES) \ $(ir3_SOURCES) -EXTRA_DIST = meson.build +EXTRA_DIST = \ + ir3/ir3_cmdline.c \ + meson.build diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 772127c7478..498c1eae1d7 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -339,7 +339,6 @@ clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring, OUT_PKT3(ring, 
CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); OUT_RINGP(ring, patch_type, &batch->gmem_patches); - OUT_RING(ring, 0); OUT_PKT3(ring, CP_SET_CONSTANT, 4); OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index 1c073e31739..692188ebd4e 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -59,6 +59,28 @@ static uint32_t fmt2swap(enum pipe_format format) } } +static bool +use_hw_binning(struct fd_batch *batch) +{ + struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + + /* we hardcoded a limit of 8 "pipes", we can increase this limit + * at the cost of a slightly larger command stream + * however very few cases will need more than 8 + * gmem->num_vsc_pipes == 0 means empty batch (TODO: does it still happen?) + */ + if (gmem->num_vsc_pipes > 8 || !gmem->num_vsc_pipes) + return false; + + /* only a20x hw binning is implement + * a22x is more like a3xx, but perhaps the a20x works? (TODO) + */ + if (!is_a20x(batch->ctx->screen)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + /* transfer from gmem to system memory (ie. 
normal RAM) */ static void @@ -272,7 +294,7 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); y0 = ((float)tile->yoff) / ((float)pfb->height); y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); - OUT_PKT3(ring, CP_MEM_WRITE, 9); + OUT_PKT3(ring, CP_MEM_WRITE, 7); OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 36, 0, 0); OUT_RING(ring, fui(x0)); OUT_RING(ring, fui(y0)); @@ -280,8 +302,6 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, fui(y0)); OUT_RING(ring, fui(x0)); OUT_RING(ring, fui(y1)); - OUT_RING(ring, fui(x1)); - OUT_RING(ring, fui(y1)); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); @@ -492,18 +512,18 @@ fd2_emit_tile_init(struct fd_batch *batch) /* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */ switch (patch->val) { case GMEM_PATCH_FASTCLEAR_COLOR: - size = align(gmem->bin_w * gmem->bin_h * color_size, 0x4000); + size = align(gmem->bin_w * gmem->bin_h * color_size, 0x8000); lines = size / 1024; depth_base = size / 2; break; case GMEM_PATCH_FASTCLEAR_DEPTH: - size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x4000); + size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x8000); lines = size / 1024; color_base = depth_base; depth_base = depth_base + size / 2; break; case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH: - lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x4000) / 1024; + lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x8000) / 1024; break; case GMEM_PATCH_RESTORE_INFO: patch->cs[0] = gmem->bin_w; @@ -535,7 +555,7 @@ fd2_emit_tile_init(struct fd_batch *batch) OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); OUT_RING(ring, 0); - if (is_a20x(ctx->screen) && fd_binning_enabled && gmem->num_vsc_pipes) { + if (use_hw_binning(batch)) { /* patch out unneeded memory exports by changing EXEC CF to EXEC_END * * in the shader compiler, we 
guarantee that the shader ends with @@ -694,7 +714,7 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, fui(0.0f)); } - if (is_a20x(ctx->screen) && fd_binning_enabled) { + if (use_hw_binning(batch)) { struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; OUT_PKT3(ring, CP_SET_CONSTANT, 2); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c index 1bd1f103ccd..2c813804689 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c @@ -55,6 +55,12 @@ fd2_setup_slices(struct fd_resource *rsc) break; } + /* mipmaps have power of two sizes in memory */ + if (level) { + width = util_next_power_of_two(width); + height = util_next_power_of_two(height); + } + slice->pitch = width; slice->offset = size; diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 5d92f86befc..b206911270a 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -74,7 +74,7 @@ ir2_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_dce); progress |= OPT(s, nir_opt_cse); /* progress |= OPT(s, nir_opt_gcm, true); */ - progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); + progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c index 460255f748a..c8719636182 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c @@ -438,7 +438,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) | A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) | 
A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap)); - OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RELOCW(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch)); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index ce8e4480be1..1879d2c60ed 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -853,7 +853,13 @@ fd_resource_create_with_modifiers(struct pipe_screen *pscreen, enum pipe_format format = tmpl->format; uint32_t size; - if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) { + /* when using kmsro, scanout buffers are allocated on the display device + * create_with_modifiers() doesn't give us usage flags, so we have to + * assume that all calls with modifiers are scanout-possible + */ + if (screen->ro && + ((tmpl->bind & PIPE_BIND_SCANOUT) || + !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { struct pipe_resource scanout_templat = *tmpl; struct renderonly_scanout *scanout; struct winsys_handle handle; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index e596a4e8462..c3b08ab0e0f 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -317,6 +317,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VIEWPORTS: return 1; + case PIPE_CAP_MAX_VARYINGS: + return 16; + case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: /* manage the variants for these ourself, to avoid breaking precompile: */ diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index a7b4a43c015..78707c66e62 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -402,6 
+402,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) return 0; case PIPE_CAP_ENDIANNESS: return PIPE_ENDIAN_LITTLE; + case PIPE_CAP_MAX_VARYINGS: + return 10; case PIPE_CAP_VENDOR_ID: return 0x8086; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index c95016a6cbe..b55b4a3c4fe 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -310,6 +310,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_CLEAR_TEXTURE: return 1; + case PIPE_CAP_MAX_VARYINGS: + return 32; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm index cd65b547279..576da1bab60 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm @@ -543,6 +543,8 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 long mov b32 $r3 0x3f800000 long nop +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long nop long ret @@ -554,7 +556,144 @@ long ret // SIZE: 9 * 8 bytes // gk104_rcp_f64: - long nop + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + ext u32 $r2 $r1 0xb14 + add b32 $r3 $r2 0xffffffff + joinat #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). 
Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + set $p0 0x1 gt u32 $r3 0x7fd + // $r3: 0 for norms, 0x36 for denorms, -1 for others + long mov b32 $r3 0x0 + sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 + join (not $p0) nop + // Process all special values: NaN, inf, denorm, 0 + mov b32 $r3 0xffffffff + // A number is NaN if its abs value is greater than or unordered with inf + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + (not $p0) bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. This + // behavior is both seen on the CPU and the blob + join or b32 $r1 $r1 0x80000 +rcp_inf_or_denorm_or_zero: + and b32 $r4 $r1 0x7ff00000 + // Other values with nonzero in exponent field should be inf + set $p0 0x1 eq s32 $r4 0x0 + sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + xor b32 $r1 $r1 0x7ff00000 + join mov b32 $r0 0x0 +rcp_denorm_or_zero: + set $p0 0x1 gtu f64 abs $r0d 0x0 + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + join or b32 $r1 $r1 0x7ff00000 +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + mul rn f64 $r0d $r0d 0x4350000000000000 + sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 + join mov b32 $r3 0x36 +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + set $p0 0x1 lt s32 $r3 0x0 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ ext u32 $r2 $r1 0xb14 + and b32 $r7 $r1 0x800fffff + add b32 $r7 $r7 0x3ff00000 + long mov b32 $r6 $r0 + sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + cvt rz f32 $r5 f64 $r6d + long rcp f32 $r4 $r5 + mov b32 $r0 0xbf800000 + fma rn f32 $r5 $r4 $r5 $r0 + fma rn f32 $r0 neg $r4 $r5 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + cvt f64 $r0d f32 $r0 + cvt f64 $r6d neg f64 $r6d + sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 + cvt f64 $r8d f32 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + subr b32 $r2 $r2 0x3ff + long add b32 $r4 $r2 $r3 + ext u32 $r3 $r1 0xb14 + // New exponent in $r3 + long add b32 $r3 $r3 $r4 + add b32 $r2 $r3 0xffffffff + sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + set $p0 0x1 lt u32 $r2 0x7fe + (not $p0) bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl b32 $r4 $r4 clamp 0x14 + long add b32 $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + set $p0 0x1 ge s32 $r3 0x7ff + (not $p0) bra #rcp_result_denorm + sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f + // Infinity + and b32 $r1 $r1 0x80000000 + long mov b32 $r0 0x0 + add b32 $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + set $p0 0x1 ne u32 $r3 0x0 + and b32 $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 cvt f64 $r6d f32 0x3e800000 + sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 + // 0x3f000000: 1/2 + (not $p0) cvt f64 $r6d f32 0x3f000000 + add b32 $r1 $r1 0x00100000 + mul rn f64 $r0d $r0d $r6d +rcp_end: long ret // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) @@ -565,7 +704,67 @@ gk104_rcp_f64: // SIZE: 14 * 8 bytes // gk104_rsq_f64: - long nop + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + $p0 or b32 $r1 $r1 0x00080000 + and b32 $r2 $r1 0x7fffffff + sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + ext u32 $r3 $r1 0xb14 + set $p1 0x1 le u32 $r3 0x2 + long or b32 $r2 $r0 $r2 + $p1 mul rn f64 $r0d $r0d 0x4350000000000000 + rsqrt64h $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + set b32 $r6 ne u32 $r3 0x7ff + long and b32 $r2 $r2 $r6 + sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 + set $p0 0x1 ne u32 $r2 0x0 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + and b32 $r1 $r1 0x80000000 + long mov b32 $r0 0x0 + long or b32 $r1 $r1 $r5 + long ret +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + long mov b32 $r4 0x0 + sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 + // 0x3f000000: 1/2 + cvt f64 $r8d f32 0x3f000000 + mul rn f64 $r2d $r0d $r8d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d 
$r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 + // Multiply 2^27 to result for small inputs to recover + $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 + long mov b32 $r1 $r5 + long mov b32 $r0 $r4 long ret // diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h index 37998768efe..ed948dee471 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h @@ -481,12 +481,132 @@ uint64_t gk104_builtin_code[] = { 0xd40040000840c785, 0x18fe00000000dde2, 0x4000000000001de4, - 0x9000000000001de7, -/* 0x0f08: gk104_rcp_f64 */ + 0x2000000000000007, 0x4000000000001de4, 0x9000000000001de7, -/* 0x0f18: gk104_rsq_f64 */ - 0x4000000000001de4, +/* 0x0f18: gk104_rcp_f64 */ + 0x7000c02c50109c03, + 0x0bfffffffc20dc02, + 0x6000000280000007, + 0x1a0ec01ff431dc03, + 0x180000000000dde2, + 0x228282f2b2d042f7, + 0x40000000000021f4, + 0x1bfffffffc00dde2, + 0x1e0edffc0001dc81, + 0x40000000200021e7, + 0x3800200000105c52, +/* 0x0f70: rcp_inf_or_denorm_or_zero */ + 0x39ffc00000111c02, + 0x190e0000fc41dc23, + 0x2202f2b2d2f042b7, + 0x40000000400001e7, + 0x39ffc00000105c82, + 0x1800000000001df2, +/* 0x0fa0: rcp_denorm_or_zero */ + 0x1e0ec0000001dc81, + 0x40000000200001e7, + 0x39ffc00000105c52, +/* 0x0fb8: rcp_denorm */ + 0x5000d0d400001c01, + 0x2280428282b282f7, + 0x18000000d800ddf2, +/* 0x0fd0: rcp_rejoin */ + 0x188e0000fc31dc23, + 0x40000006000001e7, + 0x7000c02c50109c03, + 0x3a003ffffc11dc02, + 0x08ffc0000071dc02, + 0x2800000000019de4, + 0x22e2b2a2828042b7, + 0x1006000019a15c04, + 0xc800000010511c00, + 0x1afe000000001de2, + 0x3000000014415c00, + 0x3008000014401e00, + 0x1000000001301c04, + 0x1000000019b19d04, + 0x22929292929292e7, + 0x1000cfe001321c04, + 0x2010000000611c01, + 0x2000000010001c01, + 0x2010000000611c01, + 0x2000000010001c01, + 0x2010000000611c01, + 0x2000000010001c01, + 
0x2282828282820297, + 0x2010000000611c01, + 0x2000000010001c01, + 0x0800000ffc209e02, + 0x480000000c211c03, + 0x7000c02c5010dc03, + 0x480000001030dc03, + 0x0bfffffffc309c02, + 0x22b28282b282b287, + 0x188ec01ff821dc03, + 0x40000000600021e7, + 0x6000c00050411c03, + 0x4800000004405c03, + 0x40000001c0001de7, +/* 0x10f0: rcp_result_inf_or_denorm */ + 0x1b0ec01ffc31dc23, + 0x40000000a00021e7, + 0x22f25232b2825207, + 0x3a00000000105c02, + 0x1800000000001de2, + 0x09ffc00000105c02, + 0x40000000e0001de7, +/* 0x1128: rcp_result_denorm */ + 0x1a8e0000fc31dc03, + 0x3a003ffffc105c02, + 0x1000cfa001318004, + 0x227202a2e2c282f7, + 0x1000cfc00131a004, + 0x0800400000105c02, + 0x5000000018001c01, +/* 0x1160: rcp_end */ + 0x9000000000001de7, +/* 0x1168: gk104_rsq_f64 */ + 0x1e0edffc0001dc81, + 0x3800200000104042, + 0x39fffffffc109c02, + 0x22828252c2820277, + 0x7000c02c5010dc03, + 0x198ec0000833dc03, + 0x6800000008009c43, + 0x5000d0d400000401, + 0xc80000001c115c00, + 0x128ec01ffc319c03, + 0x6800000018209c03, + 0x2282e2827202b287, + 0x1a8e0000fc21dc03, + 0x40000000800001e7, + 0x3a00000000105c02, + 0x1800000000001de2, + 0x6800000014105c43, + 0x9000000000001de7, +/* 0x11f8: rsq_norm */ + 0x1800000000011de2, + 0x22929292929292f7, + 0x1000cfc001321c04, + 0x5000000020009c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2292929292929297, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x20000002e2820297, + 0x5000d06800410401, + 0x2800000014005de4, + 0x2800000010001de4, 0x9000000000001de7, 0xc800000003f01cc5, 0x2c00000100005c04, @@ -495,7 +615,7 @@ uint64_t gk104_builtin_code[] = { 0x680100000c1fdc03, 0x4000000a60001c47, 0x180000004000dde2, -/* 0x0f60: spill_cfstack */ +/* 0x12e0: spill_cfstack */ 0x78000009c0000007, 0x0c0000000430dd02, 0x4003ffffa0001ca7, @@ -543,14 +663,14 @@ uint64_t gk104_builtin_code[] = { 
0x4000000100001ea7, 0x480100000c001c03, 0x0800000000105c42, -/* 0x10d8: shared_loop */ +/* 0x1458: shared_loop */ 0xc100000000309c85, 0x9400000500009c85, 0x0c00000010001d02, 0x0800000000105d42, 0x0c0000001030dd02, 0x4003ffff40001ca7, -/* 0x1108: shared_done */ +/* 0x1488: shared_done */ 0x2800406420001de4, 0x2800406430005de4, 0xe000000000001c45, @@ -564,7 +684,7 @@ uint64_t gk104_builtin_code[] = { 0x480000000c209c03, 0x4801000008001c03, 0x0800000000105c42, -/* 0x1170: search_cstack */ +/* 0x14f0: search_cstack */ 0x280040646000dde4, 0x8400000020009f05, 0x190ec0002821dc03, @@ -573,17 +693,17 @@ uint64_t gk104_builtin_code[] = { 0x0800000000105c42, 0x0c0000004030dd02, 0x00029dff0ffc5cbf, -/* 0x11b0: entry_found */ +/* 0x1530: entry_found */ 0x8400000000009f85, 0x2800406400001de4, 0x2800406410005de4, 0x9400000010009c85, 0x4000000000001df4, -/* 0x11d8: end_exit */ +/* 0x1558: end_exit */ 0x9800000003ffdcc5, 0xd000000000008007, 0xa000000000004007, -/* 0x11f0: end_cont */ +/* 0x1570: end_cont */ 0xd000000000008007, 0x3400c3fffc201c04, 0xc000000003f01ec5, @@ -593,6 +713,6 @@ uint64_t gk104_builtin_code[] = { uint64_t gk104_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, - 0x0000000000000f08, 0x0000000000000f18, + 0x0000000000001168, }; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm index b9c05a04b9a..4047a565a9f 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -83,12 +83,229 @@ gk110_div_s32: $p0 sub b32 $r1 $r1 $r2 $p0 add b32 $r0 $r0 0x1 $p3 cvt s32 $r0 neg s32 $r0 - sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c + sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28 $p2 cvt s32 $r1 neg s32 $r1 ret +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// 
value (single precision rcp in RCP and rsqrt64h in RSQ). +// gk110_rcp_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + ext u32 $r2 $r1 0xb14 + add b32 $r3 $r2 0xffffffff + joinat #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + set b32 $p0 0x1 gt u32 $r3 0x7fd + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov b32 $r3 0x0 + sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 + join (not $p0) nop + // Process all special values: NaN, inf, denorm, 0 + mov b32 $r3 0xffffffff + // A number is NaN if its abs value is greater than or unordered with inf + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + (not $p0) bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. 
This + // behavior is both seen on the CPU and the blob + join or b32 $r1 $r1 0x80000 +rcp_inf_or_denorm_or_zero: + and b32 $r4 $r1 0x7ff00000 + // Other values with nonzero in exponent field should be inf + set b32 $p0 0x1 eq s32 $r4 0x0 + sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + xor b32 $r1 $r1 0x7ff00000 + join mov b32 $r0 0x0 +rcp_denorm_or_zero: + set $p0 0x1 gtu f64 abs $r0d 0x0 + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + join or b32 $r1 $r1 0x7ff00000 +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + mul rn f64 $r0d $r0d 0x4350000000000000 + sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 + join mov b32 $r3 0x36 +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + set b32 $p0 0x1 lt s32 $r3 0x0 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ ext u32 $r2 $r1 0xb14 + and b32 $r7 $r1 0x800fffff + add b32 $r7 $r7 0x3ff00000 + mov b32 $r6 $r0 + sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + cvt rz f32 $r5 f64 $r6d + rcp f32 $r4 $r5 + mov b32 $r0 0xbf800000 + fma rn f32 $r5 $r4 $r5 $r0 + fma rn f32 $r0 neg $r4 $r5 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + cvt f64 $r0d f32 $r0 + cvt f64 $r6d f64 neg $r6d + sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 + cvt f64 $r8d f32 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + subr b32 $r2 $r2 0x3ff + add b32 $r4 $r2 $r3 + ext u32 $r3 $r1 0xb14 + // New exponent in $r3 + add b32 $r3 $r3 $r4 + add b32 $r2 $r3 0xffffffff + sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + set b32 $p0 0x1 lt u32 $r2 0x7fe + (not $p0) bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl b32 $r4 $r4 clamp 0x14 + add b32 $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + set b32 $p0 0x1 ge s32 $r3 0x7ff + (not $p0) bra #rcp_result_denorm + sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f + // Infinity + and b32 $r1 $r1 0x80000000 + mov b32 $r0 0x0 + add b32 $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + set b32 $p0 0x1 ne u32 $r3 0x0 + and b32 $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 cvt f64 $r6d f32 0x3e800000 + sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 + // 0x3f000000: 1/2 + (not $p0) cvt f64 $r6d f32 0x3f000000 + add b32 $r1 $r1 0x00100000 + mul rn f64 $r0d $r0d $r6d +rcp_end: + ret + +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// gk110_rsq_f64: + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + $p0 or b32 $r1 $r1 0x00080000 + and b32 $r2 $r1 0x7fffffff + sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + ext u32 $r3 $r1 0xb14 + set b32 $p1 0x1 le u32 $r3 0x2 + or b32 $r2 $r0 $r2 + $p1 mul rn f64 $r0d $r0d 0x4350000000000000 + rsqrt64h f32 $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + set b32 $r6 ne u32 $r3 0x7ff + and b32 $r2 $r2 $r6 + sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 + set b32 $p0 0x1 ne u32 $r2 0x0 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + and b32 $r1 $r1 0x80000000 + mov b32 $r0 0x0 + or b32 $r1 $r1 $r5 + ret +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + mov b32 $r4 0x0 + sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 + // 0x3f000000: 1/2 + cvt f64 $r8d f32 0x3f000000 + mul rn f64 $r2d $r0d $r8d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 
$r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 + // Multiply 2^27 to result for small inputs to recover + $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 + mov b32 $r1 $r5 + mov b32 $r0 $r4 ret .section #gk110_builtin_offsets diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h index 8d00e2a2245..3d1523f2fdd 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -65,11 +65,132 @@ uint64_t gk110_builtin_code[] = { 0xe088000001000406, 0x4000000000800001, 0xe6010000000ce802, - 0x08b08010a010b810, + 0x08a0a0a010a0b810, 0xe60100000088e806, 0x19000000001c003c, /* 0x0218: gk110_rcp_f64 */ -/* 0x0218: gk110_rsq_f64 */ + 0xc00000058a1c0409, + 0x407fffffff9c080d, + 0x1480000050000000, + 0xb3401c03fe9c0c1d, + 0xe4c03c007f9c000e, + 0x08a0a0bcacb410bc, + 0x8580000000603c02, + 0x747fffffff9fc00e, + 0xb4601fff801c021d, + 0x120000000420003c, + 0x21000400005c0404, +/* 0x0270: rcp_inf_or_denorm_or_zero */ + 0x203ff800001c0410, + 0xb3281c00001c101d, + 0x0880bcacb4bc10ac, + 0x120000000800003c, + 0x223ff800001c0404, + 0xe4c03c007fdc0002, +/* 0x02a0: rcp_denorm_or_zero */ + 0xb4601c00001c021d, + 0x120000000400003c, + 0x213ff800005c0404, +/* 0x02b8: rcp_denorm */ + 0xc400021a801c0001, + 0x08a010a0a0aca0bc, + 0x740000001b5fc00e, +/* 0x02d0: rcp_rejoin */ + 0xb3181c00001c0c1d, + 0x12000000c000003c, + 0xc00000058a1c0409, + 0x204007ffff9c041c, + 0x401ff800001c1c1d, + 0xe4c03c00001c001a, + 0x08b8aca8a0a010ac, + 0xe5400c00031c3816, + 0x84000000021c1412, + 0x745fc000001fc002, + 0xcc000000029c1016, + 0xcc081000029c1002, + 0xe5400000001c2c02, + 0xe5410000031c3c1a, + 0x08a4a4a4a4a4a4b8, + 0xc54001fc001c2c21, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0x08a0a0a0a0a080a4, + 0xdb802000001c1812, + 0xdb800000021c0002, + 
0x48000001ff9c0809, + 0xe0800000019c0812, + 0xc00000058a1c040d, + 0xe0800000021c0c0e, + 0x407fffffff9c0c09, + 0x08aca0a0aca0aca0, + 0xb3101c03ff1c081d, + 0x120000000c20003c, + 0xc24000000a1c1011, + 0xe0800000009c1006, + 0x12000000381c003c, +/* 0x03f0: rcp_result_inf_or_denorm */ + 0xb3681c03ff9c0c1d, + 0x120000001420003c, + 0x08bc948caca09480, + 0x20400000001c0404, + 0xe4c03c007f9c0002, + 0x403ff800001c0405, + 0x120000001c1c003c, +/* 0x0428: rcp_result_denorm */ + 0xb3501c00001c0c1d, + 0x204007ffff9c0404, + 0xc54001f400002c19, + 0x089c80a8b8b0a0bc, + 0xc54001f800202c19, + 0x40000800001c0405, + 0xe4000000031c0002, +/* 0x0460: rcp_end */ + 0x19000000001c003c, +/* 0x0468: gk110_rsq_f64 */ + 0xb4601fff801c021d, + 0x2100040000000404, + 0x203fffffff9c0408, + 0x08a0a094b0a0809c, + 0xc00000058a1c040d, + 0xb3301c00011c0c3d, + 0xe2001000011c000a, + 0xc400021a80040001, + 0x84000000039c0416, + 0xb2d01c03ff9c0c19, + 0xe2000000031c080a, + 0x08a0b8a09c80aca0, + 0xb3501c00001c081d, + 0x120000001000003c, + 0x20400000001c0404, + 0xe4c03c007f9c0002, + 0xe2001000029c0406, + 0x19000000001c003c, +/* 0x04f8: rsq_norm */ + 0xe4c03c007f9c0012, + 0x08a4a4a4a4a4a4bc, + 0xc54001f8001c2c21, + 0xe4000000041c000a, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0x08a4a4a4a4a4a4a4, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0x08000000b8a080a4, + 0xc400020d00041011, + 0xe4c03c00029c0006, + 0xe4c03c00021c0002, 0x19000000001c003c, }; @@ -77,5 +198,5 @@ uint64_t gk110_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, 0x0000000000000218, - 0x0000000000000218, + 0x0000000000000468, }; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 7ee5f8fc65b..faee0218d18 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ 
b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -100,10 +100,253 @@ gm107_div_s32: ret nop 0 -// STUB +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// value (single precision rcp in RCP and rsqrt64h in RSQ). +// gm107_rcp_f64: -gm107_rsq_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r2 $r1 0xb14 + iadd32i $r3 $r2 -1 + ssy #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + sched (st 0x0) (st 0x0) (st 0x0) + isetp gt u32 and $p0 1 $r3 0x7fd 1 + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov $r3 0x0 0xf + not $p0 sync + // Process all special values: NaN, inf, denorm, 0 + sched (st 0x0) (st 0x0) (st 0x0) + mov32i $r3 0xffffffff 0xf + // A number is NaN if its abs value is greater than or unordered with inf + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + not $p0 bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. 
This + // behavior is both seen on the CPU and the blob + sched (st 0x0) (st 0x0) (st 0x0) + lop32i or $r1 $r1 0x80000 + sync +rcp_inf_or_denorm_or_zero: + lop32i and $r4 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + // Other values with nonzero in exponent field should be inf + isetp eq and $p0 1 $r4 0x0 1 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + lop32i xor $r1 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + mov $r0 0x0 0xf + sync +rcp_denorm_or_zero: + dsetp gtu and $p0 1 abs $r0 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + lop32i or $r1 $r1 0x7ff00000 + sync +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + sched (st 0x0) (st 0x0) (st 0x0) + dmul $r0 $r0 0x4350000000000000 + mov $r3 0x36 0xf + sync +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt and $p0 1 $r3 0x0 1 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ bfe u32 $r2 $r1 0xb14 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r7 $r1 0x800fffff + iadd32i $r7 $r7 0x3ff00000 + mov $r6 $r0 0xf + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + sched (st 0x0) (st 0x0) (st 0x0) + f2f ftz f64 f32 $r5 $r6 + mufu rcp $r4 $r5 + mov32i $r0 0xbf800000 0xf + sched (st 0x0) (st 0x0) (st 0x0) + ffma $r5 $r4 $r5 $r0 + ffma $r0 $r5 neg $r4 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + f2f f32 f64 $r0 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + f2f f64 f64 $r6 neg $r6 + f2f f32 f64 $r8 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + dfma $r4 $r6 $r0 $r8 sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + iadd $r2 neg $r2 0x3ff + iadd $r4 $r2 $r3 + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r3 $r1 0xb14 + // New exponent in $r3 + iadd $r3 $r3 $r4 + iadd32i $r2 $r3 -1 + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt u32 and $p0 1 $r2 0x7fe 1 + not $p0 bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl $r4 $r4 0x14 + sched (st 0x0) (st 0x0) (st 0x0) + iadd $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + isetp ge and $p0 1 $r3 0x7ff 1 + sched (st 0x0) (st 0x0) (st 0x0) + not $p0 bra #rcp_result_denorm + // Infinity + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + isetp ne u32 and $p0 1 $r3 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 f2f f32 f64 $r6 0x3e800000 + // 0x3f000000: 1/2 + not $p0 f2f f32 f64 $r6 0x3f000000 + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x00100000 + dmul $r0 $r0 $r6 +rcp_end: + ret + +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// +gm107_rsq_f64: + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd) + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + $p0 lop32i or $r1 $r1 0x00080000 + lop32i and $r2 $r1 0x7fffffff + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + sched (st 0xd) (st 0xd) (st 0xd) + bfe u32 $r3 $r1 0xb14 + isetp le u32 and $p1 1 $r3 0x2 1 + lop or 1 $r2 $r0 $r2 + sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd) + $p1 dmul $r0 $r0 0x4350000000000000 + mufu rsq64h $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + iset ne u32 and $r6 $r3 0x7ff 1 + sched (st 0xd) (st 0xd) (st 0xd) + lop and 1 $r2 $r2 $r6 + isetp ne u32 and $p0 1 $r2 0x0 1 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + sched (st 0xd) (st 0xd) (st 0xd wt 0x1) + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + lop or 1 $r1 $r1 $r5 + sched (st 0xd) (st 0xf) (st 0xf) + ret + nop 0 + nop 0 +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3) + mov $r4 0x0 0xf + // 0x3f000000: 1/2 + f2f f32 f64 $r8 0x3f000000 + dmul $r2 $r0 $r8 + sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 
0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + // Multiply 2^27 to result for small inputs to recover + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd) + $p1 dmul $r4 $r4 0x41a0000000000000 + mov $r1 $r5 0xf + mov $r0 $r4 0xf + sched (st 0xd) (st 0xf) (st 0xf) ret nop 0 nop 0 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 65c93f7ae89..8eb27bbac99 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -82,8 +82,156 @@ uint64_t gm107_builtin_code[] = { 0xe32000000007000f, 0x50b0000000070f00, /* 0x0280: gm107_rcp_f64 */ -/* 0x0280: gm107_rsq_f64 */ 0x001f8000fc0007e0, + 0x38000000b1470102, + 0x1c0ffffffff70203, + 0xe29000000e000000, + 0x001f8000fc0007e0, + 0x366803807fd70307, + 0x5c9807800ff70003, + 0xf0f800000008000f, + 0x001f8000fc0007e0, + 0x010ffffffff7f003, + 0x368c03fff0070087, + 0xe24000000188000f, + 0x001f8000fc0007e0, + 0x0420008000070101, + 0xf0f800000007000f, +/* 0x02f8: rcp_inf_or_denorm_or_zero */ + 0x0407ff0000070104, + 0x001f8000fc0007e0, + 0x5b6503800ff70407, + 0xe24000000200000f, + 0x0447ff0000070101, + 0x001f8000fc0007e0, + 0x5c9807800ff70000, + 0xf0f800000007000f, +/* 0x0338: rcp_denorm_or_zero */ + 0x5b8c03800ff70087, + 0x001f8000fc0007e0, + 0xe24000000100000f, + 0x0427ff0000070101, + 0xf0f800000007000f, +/* 0x0360: rcp_denorm */ + 0x001f8000fc0007e0, + 0x3880004350070000, + 0x3898078003670003, + 0xf0f800000007000f, +/* 0x0380: rcp_rejoin */ + 0x001f8000fc0007e0, + 0x5b6303800ff70307, + 0xe24000001c00000f, + 0x38000000b1470102, + 
0x001f8000fc0007e0, + 0x040800fffff70107, + 0x1c03ff0000070707, + 0x5c98078000070006, + 0x001f8000fc0007e0, + 0x5ca8100000670e05, + 0x5080000000470504, + 0x010bf8000007f000, + 0x001f8000fc0007e0, + 0x5980000000570405, + 0x5981020000470500, + 0x5ca8000000070b00, + 0x001f8000fc0007e0, + 0x5ca8200000670f06, + 0x38a8003f80070b08, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x5b70040000070604, + 0x5b70000000470000, + 0x001f8000fc0007e0, + 0x5b70040000070604, + 0x5b70000000470000, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x381200003ff70202, + 0x5c10000000370204, + 0x001f8000fc0007e0, + 0x38000000b1470103, + 0x5c10000000470303, + 0x1c0ffffffff70302, + 0x001f8000fc0007e0, + 0x366203807fe70207, + 0xe24000000208000f, + 0x3848000001470404, + 0x001f8000fc0007e0, + 0x5c10000000170401, + 0xe24000000807000f, +/* 0x04d8: rcp_result_inf_or_denorm */ + 0x366d03807ff70307, + 0x001f8000fc0007e0, + 0xe24000000288000f, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x001f8000fc0007e0, + 0x1c07ff0000070101, + 0xe24000000407000f, +/* 0x0518: rcp_result_denorm */ + 0x5b6a03800ff70307, + 0x001f8000fc0007e0, + 0x040800fffff70101, + 0x38a8003e80000b06, + 0x38a8003f00080b06, + 0x001f8000fc0007e0, + 0x1c00010000070101, + 0x5c80000000670000, +/* 0x0558: rcp_end */ + 0xe32000000007000f, +/* 0x0560: gm107_rsq_f64 */ + 0x001fb401fda1ff0d, + 0x368c03fff0070087, + 0x0420008000000101, + 0x0407fffffff70102, + 0x001fb400fda007ed, + 0x38000000b1470103, + 0x366603800027030f, + 0x5c47020000270002, + 0x001fb401e1a0070d, + 0x3880004350010000, + 0x5080000000770105, + 0x365a03807ff70306, + 0x001fb400fda007ed, + 0x5c47000000670202, + 0x5b6a03800ff70207, + 0xe24000000400000f, + 0x003fb400fda007ed, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x5c47020000570101, + 0x001fbc00fde007ed, + 0xe32000000007000f, + 0x50b0000000070f00, + 0x50b0000000070f00, +/* 0x0620: rsq_norm */ + 0x0060b400e5a007ed, + 0x5c9807800ff70004, + 0x38a8003f00070b08, + 0x5c80000000870002, + 
0x003c3401e1a01f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x001fb401fda00f0d, + 0x38800041a0010404, + 0x5c98078000570001, + 0x5c98078000470000, + 0x001fbc00fde007ed, 0xe32000000007000f, 0x50b0000000070f00, 0x50b0000000070f00, @@ -93,5 +241,5 @@ uint64_t gm107_builtin_offsets[] = { 0x0000000000000000, 0x0000000000000120, 0x0000000000000280, - 0x0000000000000280, + 0x0000000000000560, }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index 49425b98b91..993d01c1e44 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch) binSize = 0; maxGPR = -1; + fp64 = false; main = new Function(this, "MAIN", ~0); calls.insert(&main->call); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 8085bb2f542..8d32a25ec23 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -1311,6 +1311,7 @@ class Program uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL int maxGPR; + bool fp64; MemoryPool mem_Instruction; MemoryPool mem_CmpInstruction; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index afd7916a321..335e708c5cb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1087,6 +1087,8 @@ class Source }; std::vector memoryFiles; + std::vector bufferAtomics; + private: int inferSysValDirection(unsigned sn) const; bool 
scanDeclaration(const struct tgsi_full_declaration *); @@ -1137,6 +1139,7 @@ bool Source::scanSource() //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1); memoryFiles.resize(scan.file_max[TGSI_FILE_MEMORY] + 1); + bufferAtomics.resize(scan.file_max[TGSI_FILE_BUFFER] + 1); info->immd.bufSize = 0; @@ -1483,11 +1486,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair( first, last - first + 1))); break; + case TGSI_FILE_BUFFER: + for (i = first; i <= last; ++i) + bufferAtomics[i] = decl->Declaration.Atomic; + break; case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: case TGSI_FILE_IMMEDIATE: case TGSI_FILE_SAMPLER: - case TGSI_FILE_BUFFER: case TGSI_FILE_IMAGE: break; default: @@ -2720,7 +2726,11 @@ Converter::handleLOAD(Value *dst0[4]) } Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off); - ld->cache = tgsi.getCacheMode(); + if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER && + code->bufferAtomics[r]) + ld->cache = nv50_ir::CACHE_CG; + else + ld->cache = tgsi.getCacheMode(); if (ind) ld->setIndirect(0, 1, ind); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 295497be2f9..346a98228bd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -83,6 +83,38 @@ NVC0LegalizeSSA::handleDIV(Instruction *i) delete_Instruction(prog, i); } +void +NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[]) +{ + FlowInstruction *call; + Value *def[2]; + int builtin; + + def[0] = bld.mkMovToReg(0, src[0])->getDef(0); + def[1] = bld.mkMovToReg(1, src[1])->getDef(0); + + if (i->op == OP_RCP) + builtin = NVC0_BUILTIN_RCP_F64; + else + builtin = NVC0_BUILTIN_RSQ_F64; + + call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); + def[0] = bld.getSSA(); + 
def[1] = bld.getSSA(); + bld.mkMovFromReg(def[0], 0); + bld.mkMovFromReg(def[1], 1); + bld.mkClobber(FILE_GPR, 0x3fc, 2); + bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0); + bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]); + + call->fixed = 1; + call->absolute = call->builtin = 1; + call->target.builtin = builtin; + delete_Instruction(prog, i); + + prog->fp64 = true; +} + void NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) { @@ -96,6 +128,12 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) Value *src[2], *dst[2], *def = i->getDef(0); bld.mkSplit(src, 4, i->getSrc(0)); + int chip = prog->getTarget()->getChipset(); + if (chip >= NVISA_GK104_CHIPSET) { + handleRCPRSQLib(i, src); + return; + } + // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. dst[0] = bld.loadImm(NULL, 0); dst[1] = bld.getSSA(); @@ -1063,22 +1101,6 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) } } - if (chipset >= NVISA_GK104_CHIPSET) { - // - // If TEX requires more than 4 sources, the 2nd register tuple must be - // aligned to 4, even if it consists of just a single 4-byte register. - // - // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case. 
- // - int s = i->srcCount(0xff, true); - if (s > 4 && s < 7) { - if (i->srcExists(s)) // move potential predicate out of the way - i->moveSources(s, 7 - s); - while (s < 7) - i->setSrc(s++, bld.loadImm(NULL, 0)); - } - } - return true; } @@ -1887,7 +1909,8 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB; const int slot = su->tex.r; const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); + const bool array = su->tex.target.isArray() || su->tex.target.isCube(); + const int arg = dim + array; int c; Value *zero = bld.mkImm(0); Value *p1 = NULL; @@ -1896,6 +1919,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) Value *bf, *eau, *off; Value *addr, *pred; Value *ind = su->getIndirectR(); + Value *y, *z; off = bld.getScratch(4); bf = bld.getScratch(4); @@ -1926,34 +1950,42 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) for (; c < 3; ++c) src[c] = zero; + if (dim == 2 && !array) { + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), + v, bld.loadImm(NULL, 16)); + + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless); + bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero) + ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2); + } + // set predicate output if (su->tex.target == TEX_TARGET_BUFFER) { src[0]->getInsn()->setFlagsDef(1, pred); } else - if (su->tex.target.isArray() || su->tex.target.isCube()) { + if (array) { p1 = bld.getSSA(1, FILE_PREDICATE); src[dim]->getInsn()->setFlagsDef(1, p1); } // calculate pixel offset if (dim == 1) { + y = z = zero; if (su->tex.target != TEX_TARGET_BUFFER) bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); - } else - if (dim == 3) { + } else { + y = src[1]; + z = src[2]; + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); bld.mkOp3(OP_MADSP, 
TYPE_U32, off, src[2], v, src[1]) - ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) - ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l - } else { - assert(dim == 2); - v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); - bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0]) - ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ? - NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + ->subOp = array ? + NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l } // calculate effective address part 1 @@ -1966,19 +1998,15 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); } } else { - Value *y = src[1]; - Value *z = src[2]; uint16_t subOp = 0; switch (dim) { case 1: - y = zero; - z = zero; break; case 2: - z = off; - if (!su->tex.target.isArray() && !su->tex.target.isCube()) { - z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + if (array) { + z = off; + } else { subOp = NV50_IR_SUBOP_SUBFM_3D; } break; @@ -2001,7 +2029,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v); } // add array layer offset - if (su->tex.target.isArray() || su->tex.target.isCube()) { + if (array) { v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless); if (dim == 1) bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index e0f50ab0904..99809726602 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -62,6 +62,7 @@ class NVC0LegalizeSSA : public Pass // we want to 
insert calls to the builtin library only after optimization void handleDIV(Instruction *); // integer division, modulus + void handleRCPRSQLib(Instruction *, Value *[]); void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt void handleFTZ(Instruction *); void handleSET(CmpInstruction *); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index f4379c137c5..f25bce00884 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2341,9 +2341,19 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex) if (!tex->tex.target.isArray() && tex->tex.useOffsets) s++; } - n = tex->srcCount(0xff) - s; + n = tex->srcCount(0xff, true) - s; + // TODO: Is this necessary? Perhaps just has to be aligned to the + // level that the first arg is, not necessarily to 4. This + // requirement has not been rigorously verified, as it has been on + // Kepler. + if (n > 0 && n < 3) { + if (tex->srcExists(n + s)) // move potential predicate out of the way + tex->moveSources(n + s, 3 - n); + while (n < 3) + tex->setSrc(s + n++, new_LValue(func, FILE_GPR)); + } } else { - s = tex->srcCount(0xff); + s = tex->srcCount(0xff, true); n = 0; } @@ -2366,14 +2376,18 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex) } else if (isTextureOp(tex->op)) { int n = tex->srcCount(0xff, true); - if (n > 4) { - condenseSrcs(tex, 0, 3); - if (n > 5) // NOTE: first call modified positions already - condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1)); - } else - if (n > 1) { - condenseSrcs(tex, 0, n - 1); + int s = n > 4 ? 
4 : n; + if (n > 4 && n < 7) { + if (tex->srcExists(n)) // move potential predicate out of the way + tex->moveSources(n, 7 - n); + + while (n < 7) + tex->setSrc(n++, new_LValue(func, FILE_GPR)); } + if (s > 1) + condenseSrcs(tex, 0, s - 1); + if (n > 4) + condenseSrcs(tex, 1, n - s); } } @@ -2510,6 +2524,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s) assert(cst->getSrc(s)->defs.size() == 1); // still SSA Instruction *defi = cst->getSrc(s)->defs.front()->getInsn(); + bool imm = defi->op == OP_MOV && defi->src(0).getFile() == FILE_IMMEDIATE; bool load = defi->op == OP_LOAD && diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 9193a01f189..5c6d0570ae2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -399,6 +399,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) } } } + info->io.fp64 |= fp64; info->bin.relocData = emit->getRelocInfo(); info->bin.fixupData = emit->getFixupInfo(); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2b69a8f6968..53551ebc037 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -79,6 +79,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 2048; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 8 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + return 8; + /* supported capabilities */ case PIPE_CAP_ANISOTROPIC_FILTER: case PIPE_CAP_POINT_SPRITE: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index e30380cd84d..13088ebb5fa 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -98,12 +98,10 @@ nv50_render_condition(struct pipe_context *pipe, case 
PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + if (hq->state == NV50_HW_QUERY_STATE_READY) + wait = true; if (likely(!condition)) { - if (unlikely(hq->nesting)) - cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : - NV50_3D_COND_MODE_ALWAYS; - else - cond = NV50_3D_COND_MODE_RES_NON_ZERO; + cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS; } else { cond = wait ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_ALWAYS; } @@ -129,7 +127,7 @@ nv50_render_condition(struct pipe_context *pipe, PUSH_SPACE(push, 9); - if (wait) { + if (wait && hq->state != NV50_HW_QUERY_STATE_READY) { BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c index ac3e409b2d5..4e74c462235 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c @@ -29,11 +29,6 @@ #include "nv50/nv50_query_hw_sm.h" #include "nv_object.xml.h" -#define NV50_HW_QUERY_STATE_READY 0 -#define NV50_HW_QUERY_STATE_ACTIVE 1 -#define NV50_HW_QUERY_STATE_ENDED 2 -#define NV50_HW_QUERY_STATE_FLUSHED 3 - /* XXX: Nested queries, and simultaneous queries on multiple gallium contexts * (since we use only a single GPU channel per screen) will not work properly. 
* @@ -158,8 +153,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - hq->nesting = nv50->screen->num_occlusion_queries_active++; - if (hq->nesting) { + if (nv50->screen->num_occlusion_queries_active++) { nv50_hw_query_get(push, q, 0x10, 0x0100f002); } else { PUSH_SPACE(push, 4); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h index 82ec6bd2d96..a89a66cec4f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h @@ -6,6 +6,11 @@ #include "nv50_query.h" +#define NV50_HW_QUERY_STATE_READY 0 +#define NV50_HW_QUERY_STATE_ACTIVE 1 +#define NV50_HW_QUERY_STATE_ENDED 2 +#define NV50_HW_QUERY_STATE_FLUSHED 3 + #define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nv50_hw_query; @@ -29,7 +34,6 @@ struct nv50_hw_query { uint8_t state; bool is64bit; uint8_t rotate; - int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 0c53b22eb3c..8e65eaf50b1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -156,6 +156,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return NV50_MAX_WINDOW_RECTANGLES; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 16 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + return 15; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -215,6 +217,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_CLOCK: case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: + case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: 
return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -312,6 +315,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_ATOMFADD: case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index eb057bf2489..c1351062676 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -434,6 +434,7 @@ nvc0_video_buffer_create(struct pipe_context *pipe, /* nvc0_push.c */ void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *); +void nvc0_push_vbo_indirect(struct nvc0_context *, const struct pipe_draw_info *); /* nve4_compute.c */ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 1a3e4e794c0..40af9936859 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -121,12 +121,10 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + if (hq->state == NVC0_HW_QUERY_STATE_READY) + wait = true; if (likely(!condition)) { - if (unlikely(hq->nesting)) - cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : - NVC0_3D_COND_MODE_ALWAYS; - else - cond = NVC0_3D_COND_MODE_RES_NON_ZERO; + cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS; } else { cond = wait ? 
NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS; } @@ -151,7 +149,7 @@ nvc0_render_condition(struct pipe_context *pipe, return; } - if (wait) + if (wait && hq->state != NVC0_HW_QUERY_STATE_READY) nvc0_hw_query_fifo_wait(nvc0, q); PUSH_SPACE(push, 10); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index a420ed4ac0d..f6d5d0f5602 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -28,11 +28,6 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -#define NVC0_HW_QUERY_STATE_READY 0 -#define NVC0_HW_QUERY_STATE_ACTIVE 1 -#define NVC0_HW_QUERY_STATE_ENDED 2 -#define NVC0_HW_QUERY_STATE_FLUSHED 3 - #define NVC0_HW_QUERY_ALLOC_SPACE 256 bool @@ -158,14 +153,18 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - hq->nesting = nvc0->screen->num_occlusion_queries_active++; - if (hq->nesting) { + if (nvc0->screen->num_occlusion_queries_active++) { nvc0_hw_query_get(push, q, 0x10, 0x0100f002); } else { PUSH_SPACE(push, 3); BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + /* Given that the counter is reset, the contents at 0x10 are + * equivalent to doing the query -- we would get hq->sequence as the + * payload and 0 as the reported value. This is already set up above + * as in the hq->rotate case. 
+ */ } break; case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -199,6 +198,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ + ((uint64_t *)hq->data)[(12 + 10) * 2] = 0; break; default: break; @@ -271,6 +271,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ + ((uint64_t *)hq->data)[10 * 2] = 0; break; case PIPE_QUERY_TIMESTAMP_DISJOINT: /* This query is not issued on GPU because disjoint is forced to false */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h index 8225755d85e..5c8ad5eb2d0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -6,6 +6,11 @@ #include "nvc0_query.h" +#define NVC0_HW_QUERY_STATE_READY 0 +#define NVC0_HW_QUERY_STATE_ACTIVE 1 +#define NVC0_HW_QUERY_STATE_ENDED 2 +#define NVC0_HW_QUERY_STATE_FLUSHED 3 + #define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nvc0_hw_query; @@ -29,7 +34,6 @@ struct nvc0_hw_query { uint8_t state; boolean is64bit; uint8_t rotate; - int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 68b5869276a..553fe324bc7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -182,6 +182,13 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return class_3d >= GM200_3D_CLASS ? 
8 : 0; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 64 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + /* NOTE: These only count our slots for GENERIC varyings. + * The address space may be larger, but the actual hard limit seems to be + * less than what the address space layout permits, so don't add TEXCOORD, + * COLOR, etc. here. + */ + return 0x1f0 / 16; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -266,6 +273,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: case PIPE_CAP_QUERY_SO_OVERFLOW: + case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: return 1; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; @@ -336,6 +344,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_SAMPLE_COUNT: case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: return 0; case PIPE_CAP_VENDOR_ID: @@ -392,18 +401,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: return 16; case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_VERTEX) - return 32; - /* NOTE: These only count our slots for GENERIC varyings. - * The address space may be larger, but the actual hard limit seems to be - * less than what the address space layout permits, so don't add TEXCOORD, - * COLOR, etc. here. - */ - if (shader == PIPE_SHADER_FRAGMENT) - return 0x1f0 / 16; - /* Actually this counts CLIPVERTEX, which occupies the last generic slot, - * and excludes 0x60 per-patch inputs. 
- */ return 0x200 / 16; case PIPE_SHADER_CAP_MAX_OUTPUTS: return 32; @@ -1286,8 +1283,8 @@ nvc0_screen_create(struct nouveau_device *dev) for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) { BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3); PUSH_DATA (push, 1); - PUSH_DATA (push, 8192 << 16); - PUSH_DATA (push, 8192 << 16); + PUSH_DATA (push, 16384 << 16); + PUSH_DATA (push, 16384 << 16); } #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index 04f0a0d55da..8820b5aac66 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -1051,21 +1051,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, } else { struct nv50_miptree *mt = nv50_miptree(&res->base); struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level]; - const unsigned z = view->u.tex.first_layer; - - if (z) { - if (mt->layout_3d) { - address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z); - /* doesn't work if z passes z-tile boundary */ - if (depth > 1) { - pipe_debug_message(&nvc0->base.debug, CONFORMANCE, - "3D images are not really supported!"); - debug_printf("3D images are not really supported!\n"); - } - } else { - address += mt->layer_stride * z; - } + unsigned z = view->u.tex.first_layer; + + if (!mt->layout_3d) { + address += mt->layer_stride * z; + z = 0; } + address += lvl->offset; info[0] = address >> 8; @@ -1080,7 +1072,8 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, info[6] = depth - 1; info[6] |= (lvl->tile_mode & 0xf00) << 21; info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22; - info[7] = 0; + info[7] = mt->layout_3d ? 
1 : 0; + info[7] |= z << 16; info[14] = mt->ms_x; info[15] = mt->ms_y; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 3fbe7614e52..7d6be9382d1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -1040,7 +1040,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } if (nvc0->state.vbo_mode) { - nvc0_push_vbo(nvc0, info); + if (info->indirect) + nvc0_push_vbo_indirect(nvc0, info); + else + nvc0_push_vbo(nvc0, info); goto cleanup; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c index 256e20df2e4..4333fb26d23 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c @@ -466,6 +466,83 @@ nvc0_prim_gl(unsigned prim) } } +typedef struct { + uint32_t count; + uint32_t primCount; + uint32_t first; + uint32_t baseInstance; +} DrawArraysIndirectCommand; + +typedef struct { + uint32_t count; + uint32_t primCount; + uint32_t firstIndex; + int32_t baseVertex; + uint32_t baseInstance; +} DrawElementsIndirectCommand; + +void +nvc0_push_vbo_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) +{ + /* The strategy here is to just read the commands from the indirect buffer + * and do the draws. This is suboptimal, but will only happen in the case + * that conversion is required for FIXED or DOUBLE inputs. 
+ */ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nv04_resource *buf = nv04_resource(info->indirect->buffer); + struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count); + unsigned i; + + unsigned draw_count = info->indirect->draw_count; + if (buf_count) { + uint32_t *count = nouveau_resource_map_offset( + &nvc0->base, buf_count, info->indirect->indirect_draw_count_offset, + NOUVEAU_BO_RD); + draw_count = *count; + } + + uint8_t *buf_data = nouveau_resource_map_offset( + &nvc0->base, buf, info->indirect->offset, NOUVEAU_BO_RD); + struct pipe_draw_info single = *info; + single.indirect = NULL; + for (i = 0; i < draw_count; i++, buf_data += info->indirect->stride) { + if (info->index_size) { + DrawElementsIndirectCommand *cmd = (void *)buf_data; + single.start = info->start + cmd->firstIndex; + single.count = cmd->count; + single.start_instance = cmd->baseInstance; + single.instance_count = cmd->primCount; + single.index_bias = cmd->baseVertex; + } else { + DrawArraysIndirectCommand *cmd = (void *)buf_data; + single.start = cmd->first; + single.count = cmd->count; + single.start_instance = cmd->baseInstance; + single.instance_count = cmd->primCount; + } + + if (nvc0->vertprog->vp.need_draw_parameters) { + PUSH_SPACE(push, 9); + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); + PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO); + PUSH_DATA (push, single.index_bias); + PUSH_DATA (push, single.start_instance); + PUSH_DATA (push, single.drawid + i); + } + + nvc0_push_vbo(nvc0, &single); + } + + nouveau_resource_unmap(buf); + if (buf_count) + nouveau_resource_unmap(buf_count); +} + void nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) { diff --git 
a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 19d3a1bae30..be0b475e5ef 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -304,6 +304,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; + case PIPE_CAP_MAX_VARYINGS: + return 10; + case PIPE_CAP_VENDOR_ID: return 0x1002; case PIPE_CAP_DEVICE_ID: diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index ade1a94ab32..41a878ab9d2 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -536,6 +536,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXEL_OFFSET: return 7; + case PIPE_CAP_MAX_VARYINGS: + return 32; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600; case PIPE_CAP_ENDIANNESS: diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index f86764f5220..96ffbf82927 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -1837,18 +1837,17 @@ static void r600_emit_sampler_states(struct r600_context *rctx, /* TEX_ARRAY_OVERRIDE must be set for array textures to disable * filtering between layers. - * Don't update TEX_ARRAY_OVERRIDE if we don't have the sampler view. 
*/ - if (rview) { - enum pipe_texture_target target = rview->base.texture->target; - if (target == PIPE_TEXTURE_1D_ARRAY || - target == PIPE_TEXTURE_2D_ARRAY) { - rstate->tex_sampler_words[0] |= S_03C000_TEX_ARRAY_OVERRIDE(1); - texinfo->is_array_sampler[i] = true; - } else { - rstate->tex_sampler_words[0] &= C_03C000_TEX_ARRAY_OVERRIDE; - texinfo->is_array_sampler[i] = false; - } + enum pipe_texture_target target = PIPE_BUFFER; + if (rview) + target = rview->base.texture->target; + if (target == PIPE_TEXTURE_1D_ARRAY || + target == PIPE_TEXTURE_2D_ARRAY) { + rstate->tex_sampler_words[0] |= S_03C000_TEX_ARRAY_OVERRIDE(1); + texinfo->is_array_sampler[i] = true; + } else { + rstate->tex_sampler_words[0] &= C_03C000_TEX_ARRAY_OVERRIDE; + texinfo->is_array_sampler[i] = false; } radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0)); diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c b/src/gallium/drivers/radeon/radeon_vcn_dec.c index a4e6d9dc6b5..688cef90103 100644 --- a/src/gallium/drivers/radeon/radeon_vcn_dec.c +++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c @@ -64,6 +64,7 @@ static rvcn_dec_message_avc_t get_h264_msg(struct radeon_decoder *dec, memset(&result, 0, sizeof(result)); switch (pic->base.profile) { case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE: result.profile = RDECODE_H264_PROFILE_BASELINE; break; @@ -490,7 +491,7 @@ static rvcn_dec_message_vp9_t get_vp9_msg(struct radeon_decoder *dec, assert(dec->base.max_references + 1 <= 16); - for (i = 0 ; i < dec->base.max_references + 1 ; ++i) { + for (i = 0 ; i < 16 ; ++i) { if (dec->render_pic_list[i] && dec->render_pic_list[i] == target) { result.curr_pic_idx = (uintptr_t)vl_video_buffer_get_associated_data(target, &dec->base); diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 713629c6e87..3cdd0851a5c 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ 
b/src/gallium/drivers/radeonsi/Makefile.sources @@ -14,6 +14,7 @@ C_SOURCES := \ si_compute_blit.c \ si_cp_dma.c \ si_debug.c \ + si_debug_options.h \ si_descriptors.c \ si_dma.c \ si_dma_cs.c \ diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h index cbf3bb01fb3..000a300746e 100644 --- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h +++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h @@ -11,5 +11,14 @@ DRI_CONF_SECTION_PERFORMANCE DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG - DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false") + +//= BEGIN VERBATIM +#define OPT_BOOL(name, dflt, description) \ + DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \ + DRI_CONF_DESC(en, description) \ + DRI_CONF_OPT_END + +#include "radeonsi/si_debug_options.h" +//= END VERBATIM + DRI_CONF_SECTION_END diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index 03c11cb7013..3845e56a4b3 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx, struct si_resource *buf = si_resource(transfer->resource); if (stransfer->staging) { + unsigned src_offset = stransfer->offset + + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + + (box->x - transfer->box.x); + /* Copy the staging buffer into the original one. 
*/ si_copy_buffer((struct si_context*)ctx, transfer->resource, - &stransfer->staging->b.b, box->x, - stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT, + &stransfer->staging->b.b, box->x, src_offset, box->width); } diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 9026f61dc0a..ef25c79fa9c 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -272,7 +272,7 @@ void vi_dcc_clear_level(struct si_context *sctx, } si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); } /* Set the same micro tile mode as the destination of the last MSAA resolve. @@ -505,7 +505,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, uint32_t clear_value = 0xCCCCCCCC; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); fmask_decompress_needed = true; } @@ -533,7 +533,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, uint32_t clear_value = 0; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); eliminate_needed = true; } @@ -647,7 +647,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, * This hack decreases back-to-back ClearDepth performance. 
*/ if ((sctx->db_depth_clear || sctx->db_stencil_clear) && - sctx->screen->clear_db_cache_before_clear) + sctx->screen->options.clear_db_cache_before_clear) sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; } diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 38c48c30be9..304296c4a52 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -177,7 +177,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher) + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma) { if (!size) return; @@ -241,7 +242,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, * about buffer placements. */ if (clear_value_size > 4 || - (clear_value_size == 4 && + (!force_cpdma && + clear_value_size == 4 && offset % 4 == 0 && (size > 32*1024 || sctx->chip_class <= VI))) { si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, @@ -282,7 +284,7 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, coher = SI_COHERENCY_SHADER; si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value, - clear_value_size, coher); + clear_value_size, coher, false); } void si_copy_buffer(struct si_context *sctx, diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h new file mode 100644 index 00000000000..165dba8baf5 --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -0,0 +1,4 @@ +OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth clear") +OPT_BOOL(enable_nir, false, "Enable NIR") + +#undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index bb2d8c09eb1..ff25a976e77 100644 --- 
a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -208,7 +208,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) RADEON_SPARSE_PAGE_SIZE : 0; case PIPE_CAP_PACKED_UNIFORMS: - if (sscreen->debug_flags & DBG(NIR)) + if (sscreen->options.enable_nir) return 1; return 0; @@ -254,6 +254,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: return 30; + case PIPE_CAP_MAX_VARYINGS: + return 32; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return sscreen->info.chip_class <= VI ? PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; @@ -420,11 +423,11 @@ static int si_get_shader_param(struct pipe_screen* pscreen, case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return SI_NUM_IMAGES; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - if (sscreen->debug_flags & DBG(NIR)) + if (sscreen->options.enable_nir) return 0; return 32; case PIPE_SHADER_CAP_PREFERRED_IR: - if (sscreen->debug_flags & DBG(NIR)) + if (sscreen->options.enable_nir) return PIPE_SHADER_IR_NIR; return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 2da14f8868f..d55394f2cba 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -1333,7 +1333,7 @@ void si_init_perfcounters(struct si_screen *screen) for (i = 0; i < num_blocks; ++i) { struct si_pc_block *block = &pc->blocks[i]; block->b = &blocks[i]; - block->num_instances = block->b->instances; + block->num_instances = MAX2(1, block->b->instances); if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB")) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 41d395d7d3f..507ca65605f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -61,7 +61,6 @@ static const 
struct debug_named_value debug_options[] = { /* Shader compiler options (with no effect on the shader cache): */ { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" }, - { "nir", DBG(NIR), "Enable experimental NIR shaders" }, { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" }, { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." }, @@ -609,11 +608,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, si_begin_new_gfx_cs(sctx); if (sctx->chip_class == CIK) { - /* Clear the NULL constant buffer, because loads should return zeros. */ + /* Clear the NULL constant buffer, because loads should return zeros. + * Note that this forces CP DMA to be used, because clover deadlocks + * for some reason when the compute codepath is used. + */ uint32_t clear_value = 0; si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, - &clear_value, 4, SI_COHERENCY_SHADER); + &clear_value, 4, SI_COHERENCY_SHADER, true); } return &sctx->b; fail: @@ -804,8 +806,7 @@ static void si_disk_cache_create(struct si_screen *sscreen) #define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) | \ DBG(SI_SCHED) | \ DBG(GISEL) | \ - DBG(UNSAFE_MATH) | \ - DBG(NIR)) + DBG(UNSAFE_MATH)) uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; @@ -813,7 +814,11 @@ static void si_disk_cache_create(struct si_screen *sscreen) * how 32-bit addresses are expanded to 64 bits. 
*/ STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); - shader_debug_flags |= (uint64_t)sscreen->info.address32_hi << 32; + assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi); + shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32; + + if (sscreen->options.enable_nir) + shader_debug_flags |= 1ull << 48; sscreen->disk_shader_cache = disk_cache_create(sscreen->info.name, @@ -866,7 +871,6 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, if (driQueryOptionb(config->options, "radeonsi_enable_sisched")) sscreen->debug_flags |= DBG(SI_SCHED); - if (sscreen->debug_flags & DBG(INFO)) ac_print_gpu_info(&sscreen->info); @@ -1013,8 +1017,16 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, driQueryOptionb(config->options, "radeonsi_assume_no_z_fights"); sscreen->commutative_blend_add = driQueryOptionb(config->options, "radeonsi_commutative_blend_add"); - sscreen->clear_db_cache_before_clear = - driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear"); + + { +#define OPT_BOOL(name, dflt, description) \ + sscreen->options.name = \ + driQueryOptionb(config->options, "radeonsi_"#name); +#include "si_debug_options.h" + } + + sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN; sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 && sscreen->info.family <= CHIP_POLARIS12) || sscreen->info.family == CHIP_VEGA10 || diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index eb3ba951dae..ea009622970 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -133,7 +133,6 @@ enum { /* Shader compiler options (with no effect on the shader cache): */ DBG_CHECK_IR, - DBG_NIR, DBG_MONOLITHIC_SHADERS, DBG_NO_OPT_VARIANT, @@ -445,7 +444,7 @@ struct si_screen { bool has_out_of_order_rast; bool assume_no_z_fights; bool 
commutative_blend_add; - bool clear_db_cache_before_clear; + bool has_gfx9_scissor_bug; bool has_msaa_sample_loc_bug; bool has_ls_vgpr_init_bug; bool has_dcc_constant_encode; @@ -453,6 +452,11 @@ struct si_screen { bool dfsm_allowed; bool llvm_has_working_vgpr_indexing; + struct { +#define OPT_BOOL(name, dflt, description) bool name:1; +#include "si_debug_options.h" + } options; + /* Whether shaders are monolithic (1-part) or separate (3-part). */ bool use_monolithic_shaders; bool record_llvm_ir; @@ -1054,7 +1058,7 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned context_roll_counter; + unsigned context_roll; /* Queries. */ /* Maintain the list of active queries for pausing between IBs. */ @@ -1168,7 +1172,8 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy); void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher); + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 266b9d3ce84..280eee3a280 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -549,11 +549,15 @@ void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buff } buffer->results_end = 0; + if (!buffer->buf) + return; + /* Discard even the oldest buffer if it can't be mapped without a stall. 
*/ - if (buffer->buf && - (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE))) { + if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) { si_resource_reference(&buffer->buf, NULL); + } else { + buffer->unprepared = true; } } @@ -561,29 +565,31 @@ bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buff bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*), unsigned size) { - if (buffer->buf && buffer->results_end + size >= buffer->buf->b.b.width0) - return true; + bool unprepared = buffer->unprepared; + buffer->unprepared = false; + + if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) { + if (buffer->buf) { + struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); + memcpy(qbuf, buffer, sizeof(*qbuf)); + buffer->previous = qbuf; + } + buffer->results_end = 0; - if (buffer->buf) { - struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); - memcpy(qbuf, buffer, sizeof(*qbuf)); - buffer->previous = qbuf; + /* Queries are normally read by the CPU after + * being written by the gpu, hence staging is probably a good + * usage pattern. + */ + struct si_screen *screen = sctx->screen; + unsigned buf_size = MAX2(size, screen->info.min_alloc_size); + buffer->buf = si_resource( + pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!buffer->buf)) + return false; + unprepared = true; } - buffer->results_end = 0; - - /* Queries are normally read by the CPU after - * being written by the gpu, hence staging is probably a good - * usage pattern. 
- */ - struct si_screen *screen = sctx->screen; - unsigned buf_size = MAX2(size, screen->info.min_alloc_size); - buffer->buf = si_resource( - pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!buffer->buf)) - return false; - - if (prepare_buffer) { + if (unprepared && prepare_buffer) { if (unlikely(!prepare_buffer(sctx, buffer))) { si_resource_reference(&buffer->buf, NULL); return false; diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index aaf0bd03aca..c61af51d57c 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -177,12 +177,13 @@ struct si_query_hw_ops { struct si_query_buffer { /* The buffer where query results are stored. */ struct si_resource *buf; - /* Offset of the next free result after current query data */ - unsigned results_end; /* If a query buffer is full, a new buffer is created and the old one * is put in here. When we calculate the result, we sum up the samples * from all buffers. 
*/ struct si_query_buffer *previous; + /* Offset of the next free result after current query data */ + unsigned results_end; + bool unprepared; }; void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 7554f5b9f8b..d7618b46eb0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -834,7 +834,7 @@ si_lower_nir(struct si_shader_selector* sel) NIR_PASS(progress, sel->nir, nir_opt_if); NIR_PASS(progress, sel->nir, nir_opt_dead_cf); NIR_PASS(progress, sel->nir, nir_opt_cse); - NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true); /* Needed for algebraic lowering */ NIR_PASS(progress, sel->nir, nir_opt_algebraic); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 89d81c97e18..85103a614b1 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -256,7 +256,7 @@ static void si_emit_cb_render_state(struct si_context *sctx) sx_blend_opt_control); } if (initial_cdw != cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -792,7 +792,7 @@ static void si_emit_clip_regs(struct si_context *sctx) S_028810_CLIP_DISABLE(window_space)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -1446,7 +1446,7 @@ static void si_emit_db_render_state(struct si_context *sctx) SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -3527,7 +3527,7 @@ static void si_emit_msaa_config(struct si_context *sctx) SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); if (initial_cdw != cs->current.cdw) { - sctx->context_roll_counter++; + 
sctx->context_roll = true; /* GFX9: Flush DFSM when the AA mode changes. */ if (sctx->screen->dfsm_allowed) { diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 767e789276a..344f45e7e43 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -224,7 +224,8 @@ static inline unsigned si_atoms_that_always_roll_context(void) SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) | SI_ATOM_BIT(stencil_ref) | - SI_ATOM_BIT(scratch_state)); + SI_ATOM_BIT(scratch_state) | + SI_ATOM_BIT(window_rectangles)); } struct si_shader_data { diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 3516e561282..5c6c2e69b90 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -321,7 +321,7 @@ static void si_emit_dpbb_disable(struct si_context *sctx) S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } void si_emit_dpbb_state(struct si_context *sctx) @@ -443,5 +443,5 @@ void si_emit_dpbb_state(struct si_context *sctx) S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index c7c02d20d15..7bf82b8b05b 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -66,7 +66,7 @@ static unsigned si_conv_pipe_prim(unsigned mode) * The information about LDS and other non-compile-time parameters is then * written to userdata SGPRs. 
*/ -static bool si_emit_derived_tess_state(struct si_context *sctx, +static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { @@ -110,7 +110,7 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) { *num_patches = sctx->last_num_patches; - return false; + return; } sctx->last_ls = ls_current; @@ -305,9 +305,8 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, ls_hs_config); } sctx->last_ls_hs_config = ls_hs_config; - return true; /* true if the context rolls */ + sctx->context_roll = true; } - return false; } static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info) @@ -541,7 +540,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, } /* rast_prim is the primitive type after GS. */ -static bool si_emit_rasterizer_prim_state(struct si_context *sctx) +static void si_emit_rasterizer_prim_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = sctx->gfx_cs; enum pipe_prim_type rast_prim = sctx->current_rast_prim; @@ -549,11 +548,11 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx) /* Skip this if not rendering lines. */ if (!util_prim_is_lines(rast_prim)) - return false; + return; if (rast_prim == sctx->last_rast_prim && rs->pa_sc_line_stipple == sctx->last_sc_line_stipple) - return false; + return; /* For lines, reset the stipple pattern at each primitive. Otherwise, * reset the stipple pattern at each packet (line strips, line loops). 
@@ -564,7 +563,7 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx) sctx->last_rast_prim = rast_prim; sctx->last_sc_line_stipple = rs->pa_sc_line_stipple; - return true; /* true if the context rolls */ + sctx->context_roll = true; } static void si_emit_vs_state(struct si_context *sctx, @@ -659,6 +658,7 @@ static void si_emit_draw_registers(struct si_context *sctx, radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index); sctx->last_restart_index = info->restart_index; + sctx->context_roll = true; } } @@ -896,6 +896,10 @@ static void si_emit_surface_sync(struct si_context *sctx, radeon_emit(cs, 0); /* CP_COHER_BASE */ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ } + + /* ACQUIRE_MEM has an implicit context roll if the current context + * is busy. */ + sctx->context_roll = true; } void si_emit_cache_flush(struct si_context *sctx) @@ -1210,26 +1214,10 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i unsigned skip_atom_mask) { unsigned num_patches = 0; - /* Vega10/Raven scissor bug workaround. When any context register is - * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR - * registers must be written too. - */ - bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) && - !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors); - bool context_roll = false; /* set correctly for GFX9 only */ - context_roll |= si_emit_rasterizer_prim_state(sctx); + si_emit_rasterizer_prim_state(sctx); if (sctx->tes_shader.cso) - context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches); - - if (handle_scissor_bug && - (info->count_from_stream_output || - sctx->dirty_atoms & si_atoms_that_always_roll_context() || - sctx->dirty_states & si_states_that_always_roll_context() || - si_prim_restart_index_changed(sctx, info))) - context_roll = true; - - sctx->context_roll_counter = 0; + si_emit_derived_tess_state(sctx, info, &num_patches); /* Emit state atoms. 
*/ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; @@ -1252,12 +1240,6 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } sctx->dirty_states = 0; - if (handle_scissor_bug && - (context_roll || sctx->context_roll_counter)) { - sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - sctx->atoms.s.scissors.emit(sctx); - } - /* Emit draw states. */ si_emit_vs_state(sctx, info); si_emit_draw_registers(sctx, info, num_patches); @@ -1456,6 +1438,22 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (!si_upload_vertex_buffer_descriptors(sctx)) return; + /* Vega10/Raven scissor bug workaround. When any context register is + * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR + * registers must be written too. + */ + bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug; + unsigned masked_atoms = 0; + + if (has_gfx9_scissor_bug) { + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); + + if (info->count_from_stream_output || + sctx->dirty_atoms & si_atoms_that_always_roll_context() || + sctx->dirty_states & si_states_that_always_roll_context()) + sctx->context_roll = true; + } + /* Use optimal packet order based on whether we need to sync the pipeline. */ if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB | @@ -1466,8 +1464,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i * Then draw and prefetch at the end. This ensures that the time * the CUs are idle is very short. 
*/ - unsigned masked_atoms = 0; - if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); @@ -1481,6 +1477,13 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) sctx->atoms.s.render_cond.emit(sctx); + + if (has_gfx9_scissor_bug && + (sctx->context_roll || + si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); @@ -1505,7 +1508,16 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (!si_upload_graphics_shader_descriptors(sctx)) return; - si_emit_all_states(sctx, info, 0); + si_emit_all_states(sctx, info, masked_atoms); + + if (has_gfx9_scissor_bug && + (sctx->context_roll || + si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } + sctx->dirty_atoms = 0; + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* Prefetch the remaining shaders after the draw has been @@ -1514,6 +1526,9 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i cik_emit_prefetch_L2(sctx, false); } + /* Clear the context roll flag after the draw call. 
*/ + sctx->context_roll = false; + if (unlikely(sctx->current_saved_cs)) { si_trace_emit(sctx); si_log_draw_state(sctx, sctx->log); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9e052e1efce..e76bb49dff8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -576,7 +576,7 @@ static void si_emit_shader_es(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) @@ -825,7 +825,7 @@ static void si_emit_shader_gs(struct si_context *sctx) } if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -1002,7 +1002,7 @@ static void si_emit_shader_vs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /** @@ -1194,7 +1194,7 @@ static void si_emit_shader_ps(struct si_context *sctx) shader->ctx_reg.ps.cb_shader_mask); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_ps(struct si_shader *shader) @@ -2869,7 +2869,7 @@ static void si_emit_spi_map(struct si_context *sctx) sctx->tracked_regs.spi_ps_input_cntl, num_interp); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /** diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 2bf6862c89b..2a0a4bef9a2 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -303,6 +303,7 @@ void 
si_emit_streamout_end(struct si_context *sctx) * buffer bound. This ensures that the primitives-emitted query * won't increment. */ radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + sctx->context_roll = true; t[i]->buf_filled_size_valid = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index dac90df1c4f..1ec69216841 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -185,6 +185,16 @@ static void si_emit_guardband(struct si_context *ctx) const unsigned hw_screen_offset_alignment = ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); + /* Indexed by quantization modes */ + static int max_viewport_size[] = {65535, 16383, 4095}; + + /* Ensure that the whole viewport stays representable in + * absolute coordinates. + * See comment in si_set_viewport_states. + */ + assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && + vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); + hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); @@ -219,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx) * * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. 
*/ - static unsigned max_viewport_size[] = {65535, 16383, 4095}; assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; left = (-max_range - vp.translate[0]) / vp.scale[0]; @@ -274,7 +283,7 @@ static void si_emit_guardband(struct si_context *ctx) S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); if (initial_cdw != ctx->gfx_cs->current.cdw) - ctx->context_roll_counter++; + ctx->context_roll = true; } static void si_emit_scissors(struct si_context *ctx) @@ -333,6 +342,8 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned h = scissor->maxy - scissor->miny; unsigned max_extent = MAX2(w, h); + int max_corner = MAX2(scissor->maxx, scissor->maxy); + unsigned center_x = (scissor->maxx + scissor->minx) / 2; unsigned center_y = (scissor->maxy + scissor->miny) / 2; unsigned max_center = MAX2(center_x, center_y); @@ -358,7 +369,22 @@ static void si_set_viewport_states(struct pipe_context *pctx, if (ctx->family == CHIP_RAVEN) max_extent = 16384; /* Use QUANT_MODE == 16_8. */ - if (max_extent <= 1024) /* 4K scanline area for guardband */ + /* Another constraint is that all coordinates in the viewport + * are representable in fixed point with respect to the + * surface origin. + * + * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given + * an offset that would make the upper corner of the viewport + * greater than the maximum representable number post + * quantization, ie 2^quant_bits. + * + * This does not matter for 14.10 and 16.8 formats since the + * offset is already limited at 8k, but it means we can't use + * 12.12 if we are drawing to some pixels outside the lower + * 4k x 4k of the render target. 
+ */ + + if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; else if (max_extent <= 4096) /* 16K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index 90a2032cd80..7e396e671be 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -309,7 +309,7 @@ void si_test_dma(struct si_screen *sscreen) /* clear dst pixels */ uint32_t zero = 0; si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, - SI_COHERENCY_SHADER); + SI_COHERENCY_SHADER, false); memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* preparation */ diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 44e48cc7ee4..6931b52dc9f 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -265,6 +265,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_CLEAR_TEXTURE: return 1; + case PIPE_CAP_MAX_VARYINGS: + return TGSI_EXEC_MAX_INPUT_ATTRIBS; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index ffe49260b9a..a91e4f588c8 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -390,17 +390,6 @@ setup_sort_vertices(struct setup_context *setup, return FALSE; } - - /* Prepare pixel offset for rasterisation: - * - pixel center (0.5, 0.5) for GL, or - * - assume (0.0, 0.0) for other APIs. 
- */ - if (setup->softpipe->rasterizer->half_pixel_center) { - setup->pixel_offset = 0.5f; - } else { - setup->pixel_offset = 0.0f; - } - return TRUE; } @@ -1476,6 +1465,16 @@ sp_setup_prepare(struct setup_context *setup) } } + /* Prepare pixel offset for rasterisation: + * - pixel center (0.5, 0.5) for GL, or + * - assume (0.0, 0.0) for other APIs. + */ + if (setup->softpipe->rasterizer->half_pixel_center) { + setup->pixel_offset = 0.5f; + } else { + setup->pixel_offset = 0.0f; + } + setup->max_layer = max_layer; sp->quad.first->begin( sp->quad.first ); diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c index 351736ee421..998939bdf30 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -373,17 +373,18 @@ sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc, int layer) if (util_format_is_pure_uint(tc->surface->format)) { pipe_put_tile_ui_format(pt, tc->transfer_map[layer], x, y, TILE_SIZE, TILE_SIZE, - pt->resource->format, + tc->surface->format, (unsigned *) tc->tile->data.colorui128); } else if (util_format_is_pure_sint(tc->surface->format)) { pipe_put_tile_i_format(pt, tc->transfer_map[layer], x, y, TILE_SIZE, TILE_SIZE, - pt->resource->format, + tc->surface->format, (int *) tc->tile->data.colori128); } else { - pipe_put_tile_rgba(pt, tc->transfer_map[layer], - x, y, TILE_SIZE, TILE_SIZE, - (float *) tc->tile->data.color); + pipe_put_tile_rgba_format(pt, tc->transfer_map[layer], + x, y, TILE_SIZE, TILE_SIZE, + tc->surface->format, + (float *) tc->tile->data.color); } } numCleared++; diff --git a/src/gallium/drivers/svga/Makefile.sources b/src/gallium/drivers/svga/Makefile.sources index 72024cf60e1..229d2863c84 100644 --- a/src/gallium/drivers/svga/Makefile.sources +++ b/src/gallium/drivers/svga/Makefile.sources @@ -15,8 +15,6 @@ C_SOURCES := \ svga_hw_reg.h \ svga_link.c \ svga_link.h \ - svga_msg.c \ - svga_msg.h \ svga_mksstats.h \ 
svga_pipe_blend.c \ svga_pipe_blit.c \ diff --git a/src/gallium/drivers/svga/meson.build b/src/gallium/drivers/svga/meson.build index 7981e2991f3..4d3207a9942 100644 --- a/src/gallium/drivers/svga/meson.build +++ b/src/gallium/drivers/svga/meson.build @@ -27,7 +27,6 @@ files_svga = files( 'svga_draw_elements.c', 'svga_format.c', 'svga_link.c', - 'svga_msg.c', 'svga_pipe_blend.c', 'svga_pipe_blit.c', 'svga_pipe_clear.c', diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c index 5557d208171..6577c839cf0 100644 --- a/src/gallium/drivers/svga/svga_cmd.c +++ b/src/gallium/drivers/svga/svga_cmd.c @@ -1693,7 +1693,7 @@ SVGA3D_BindGBSurface(struct svga_winsys_context *swc, return PIPE_ERROR_OUT_OF_MEMORY; swc->surface_relocation(swc, &cmd->sid, &cmd->mobid, surface, - SVGA_RELOC_READ | SVGA_RELOC_INTERNAL); + SVGA_RELOC_READ); swc->commit(swc); diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 95dde8b0897..f747ff78bcf 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -37,7 +37,6 @@ #include "svga_public.h" #include "svga_context.h" #include "svga_format.h" -#include "svga_msg.h" #include "svga_screen.h" #include "svga_tgsi.h" #include "svga_resource_texture.h" @@ -350,6 +349,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */ + case PIPE_CAP_MAX_VARYINGS: + return sws->have_vgpu10 ? 
VGPU10_MAX_FS_INPUTS : 10; /* Unsupported features */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -889,17 +890,18 @@ svga_get_driver_query_info(struct pipe_screen *screen, static void init_logging(struct pipe_screen *screen) { + struct svga_screen *svgascreen = svga_screen(screen); static const char *log_prefix = "Mesa: "; char host_log[1000]; /* Log Version to Host */ util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix), - "%s%s", log_prefix, svga_get_name(screen)); - svga_host_log(host_log); + "%s%s\n", log_prefix, svga_get_name(screen)); + svgascreen->sws->host_log(svgascreen->sws, host_log); util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix), "%s" PACKAGE_VERSION MESA_GIT_SHA1, log_prefix); - svga_host_log(host_log); + svgascreen->sws->host_log(svgascreen->sws, host_log); /* If the SVGA_EXTRA_LOGGING env var is set, log the process's command * line (program name and arguments). @@ -908,13 +910,23 @@ init_logging(struct pipe_screen *screen) char cmdline[1000]; if (os_get_command_line(cmdline, sizeof(cmdline))) { util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix), - "%s%s", log_prefix, cmdline); - svga_host_log(host_log); + "%s%s\n", log_prefix, cmdline); + svgascreen->sws->host_log(svgascreen->sws, host_log); } } } +/** + * no-op logging function to use when SVGA_NO_LOGGING is set. 
+ */ +static void +nop_host_log(struct svga_winsys_screen *sws, const char *message) +{ + /* nothing */ +} + + static void svga_destroy_screen( struct pipe_screen *screen ) { @@ -1132,7 +1144,11 @@ svga_screen_create(struct svga_winsys_screen *sws) svga_screen_cache_init(svgascreen); - init_logging(screen); + if (debug_get_bool_option("SVGA_NO_LOGGING", FALSE) == TRUE) { + svgascreen->sws->host_log = nop_host_log; + } else { + init_logging(screen); + } return screen; error2: diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h index a067a7ba09d..14782e19a7d 100644 --- a/src/gallium/drivers/svga/svga_winsys.h +++ b/src/gallium/drivers/svga/svga_winsys.h @@ -753,6 +753,11 @@ struct svga_winsys_screen void (*stats_time_pop)(); + /** + * Send a host log message + */ + void + (*host_log)(struct svga_winsys_screen *sws, const char *message); /** Have VGPU v10 hardware? */ boolean have_vgpu10; diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index de9008ddf6a..bee011e4abf 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -369,6 +369,8 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) return 32; case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: return 1 << 27; + case PIPE_CAP_MAX_VARYINGS: + return 32; case PIPE_CAP_VENDOR_ID: return 0xFFFFFFFF; @@ -844,7 +846,9 @@ swr_texture_layout(struct swr_screen *screen, size_t total_size = (uint64_t)res->swr.depth * res->swr.qpitch * res->swr.pitch * res->swr.numSamples; - if (total_size > SWR_MAX_TEXTURE_SIZE) + + // Let non-sampled textures (e.g. 
buffer objects) bypass the size limit + if (swr_resource_is_texture(&res->base) && total_size > SWR_MAX_TEXTURE_SIZE) return false; if (allocate) { diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c index 2f36bdd46e3..d42e8fd0e69 100644 --- a/src/gallium/drivers/v3d/v3d_blit.c +++ b/src/gallium/drivers/v3d/v3d_blit.c @@ -491,7 +491,8 @@ v3d_tfu_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) if ((info->mask & PIPE_MASK_RGBA) == 0) return false; - if (info->dst.box.x != 0 || + if (info->scissor_enable || + info->dst.box.x != 0 || info->dst.box.y != 0 || info->dst.box.width != dst_width || info->dst.box.height != dst_height || diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c index 21c68942e14..84e86799d5e 100644 --- a/src/gallium/drivers/v3d/v3d_resource.c +++ b/src/gallium/drivers/v3d/v3d_resource.c @@ -780,7 +780,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, rsc->tiled = false; } else { fprintf(stderr, "Unsupported modifier requested\n"); - return NULL; + goto fail; } rsc->internal_format = prsc->format; diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index bed2c63a64d..17afeebb4fc 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -70,6 +70,7 @@ v3d_screen_destroy(struct pipe_screen *pscreen) util_hash_table_destroy(screen->bo_handles); v3d_bufmgr_destroy(pscreen); slab_destroy_parent(&screen->transfer_pool); + free(screen->ro); if (using_v3d_simulator) v3d_simulator_destroy(screen); @@ -177,11 +178,17 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: return 4; + case PIPE_CAP_MAX_VARYINGS: + return V3D_MAX_FS_INPUTS / 4; + /* Texturing. 
*/ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - return V3D_MAX_MIP_LEVELS; + if (screen->devinfo.ver < 40) + return 12; + else + return V3D_MAX_MIP_LEVELS; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: return 2048; diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c index 2700208e388..4b1b03b5db5 100644 --- a/src/gallium/drivers/v3d/v3dx_draw.c +++ b/src/gallium/drivers/v3d/v3dx_draw.c @@ -55,7 +55,28 @@ v3d_start_draw(struct v3d_context *v3d) job->submit.bcl_start = job->bcl.bo->offset; v3d_job_add_bo(job, job->bcl.bo); - job->tile_alloc = v3d_bo_alloc(v3d->screen, 1024 * 1024, "tile_alloc"); + /* The PTB will request the tile alloc initial size per tile at start + * of tile binning. + */ + uint32_t tile_alloc_size = (job->draw_tiles_x * + job->draw_tiles_y) * 64; + /* The PTB allocates in aligned 4k chunks after the initial setup. */ + tile_alloc_size = align(tile_alloc_size, 4096); + + /* Include the first two chunk allocations that the PTB does so that + * we definitely clear the OOM condition before triggering one (the HW + * won't trigger OOM during the first allocations). + */ + tile_alloc_size += 8192; + + /* For performance, allocate some extra initial memory after the PTB's + * minimal allocations, so that we hopefully don't have to block the + * GPU on the kernel handling an OOM signal. + */ + tile_alloc_size += 512 * 1024; + + job->tile_alloc = v3d_bo_alloc(v3d->screen, tile_alloc_size, + "tile_alloc"); uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64; job->tile_state = v3d_bo_alloc(v3d->screen, job->draw_tiles_y * @@ -203,8 +224,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, * shader needs to write the Z value (even just discards). 
*/ shader.fragment_shader_does_z_writes = - (v3d->prog.fs->prog_data.fs->writes_z || - v3d->prog.fs->prog_data.fs->discard); + v3d->prog.fs->prog_data.fs->writes_z; + /* Set if the EZ test must be disabled (due to shader side + * effects and the early_z flag not being present in the + * shader). + */ + shader.turn_off_early_z_test = + v3d->prog.fs->prog_data.fs->disable_ez; shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 = v3d->prog.fs->prog_data.fs->uses_center_w; diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c index f326b5379ba..eff6bcfca06 100644 --- a/src/gallium/drivers/v3d/v3dx_state.c +++ b/src/gallium/drivers/v3d/v3dx_state.c @@ -846,6 +846,9 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, prsc->target == PIPE_TEXTURE_1D_ARRAY) { tex->image_height = tex->image_width >> 14; } + + tex->image_width &= (1 << 14) - 1; + tex->image_height &= (1 << 14) - 1; #endif if (prsc->target == PIPE_TEXTURE_3D) { diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 2d0a52bb5fb..8f1e561c444 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c index 6e4681e93cc..f08785f457f 100644 --- a/src/gallium/drivers/vc4/vc4_query.c +++ b/src/gallium/drivers/vc4/vc4_query.c @@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries, /* We can't mix HW and non-HW queries. 
*/ if (nhwqueries && nhwqueries != num_queries) - return NULL; + goto err_free_query; if (!nhwqueries) return (struct pipe_query *)query; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index e7f7c82c271..acb4a1feb0d 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Note: Not supported in hardware, just faking it. */ return 5; + case PIPE_CAP_MAX_VARYINGS: + return 8; + case PIPE_CAP_VENDOR_ID: return 0x14E4; case PIPE_CAP_ACCELERATED: diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c index 08f85f8574a..f9d8e231a13 100644 --- a/src/gallium/drivers/virgl/virgl_context.c +++ b/src/gallium/drivers/virgl/virgl_context.c @@ -765,7 +765,6 @@ static void virgl_flush_from_st(struct pipe_context *ctx, enum pipe_flush_flags flags) { struct virgl_context *vctx = virgl_context(ctx); - struct virgl_screen *rs = virgl_screen(ctx->screen); if (flags & PIPE_FLUSH_FENCE_FD) vctx->cbuf->needs_out_fence_fd = true; diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index 42e0987e0c9..17fa5fc51cc 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -258,6 +258,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_FLOAT_LINEAR: case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; /* TODO: need to introduce a hw-cap for this */ + case PIPE_CAP_MAX_VARYINGS: + if (vscreen->caps.caps.v1.glsl_level < 150) + return vscreen->caps.caps.v2.max_vertex_attribs; + return 32; case PIPE_CAP_TEXTURE_GATHER_SM5: case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_FAKE_SW_MSAA: diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 867d0cb5d74..96e8fbed1be 100644 --- 
a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -856,6 +856,7 @@ enum pipe_cap PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE, PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND, PIPE_CAP_DEST_SURFACE_SRGB_CONTROL, + PIPE_CAP_MAX_VARYINGS, }; /** diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h index b5b8b062285..adbe7858d0f 100644 --- a/src/gallium/include/pipe/p_video_enums.h +++ b/src/gallium/include/pipe/p_video_enums.h @@ -70,7 +70,8 @@ enum pipe_video_profile PIPE_VIDEO_PROFILE_HEVC_MAIN_444, PIPE_VIDEO_PROFILE_JPEG_BASELINE, PIPE_VIDEO_PROFILE_VP9_PROFILE0, - PIPE_VIDEO_PROFILE_VP9_PROFILE2 + PIPE_VIDEO_PROFILE_VP9_PROFILE2, + PIPE_VIDEO_PROFILE_MAX }; /* Video caps, can be different for each codec/profile */ diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c index ebbbabb6492..930d440a1e2 100644 --- a/src/gallium/state_trackers/dri/dri2.c +++ b/src/gallium/state_trackers/dri/dri2.c @@ -994,11 +994,6 @@ dri2_create_image_common(__DRIscreen *_screen, if (!map) return NULL; - /* createImageWithModifiers doesn't supply usage, and we should not get - * here with both modifiers and a usage flag. 
- */ - assert(!(use && (modifiers != NULL))); - tex_usage = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; if (use & __DRI_IMAGE_USE_SCANOUT) @@ -1071,7 +1066,7 @@ dri2_create_image_with_modifiers(__DRIscreen *dri_screen, void *loaderPrivate) { return dri2_create_image_common(dri_screen, width, height, format, - 0 /* use */, modifiers, count, + __DRI_IMAGE_USE_SHARE, modifiers, count, loaderPrivate); } diff --git a/src/gallium/state_trackers/glx/xlib/meson.build b/src/gallium/state_trackers/glx/xlib/meson.build index f4ee75426bc..34b93c94cf2 100644 --- a/src/gallium/state_trackers/glx/xlib/meson.build +++ b/src/gallium/state_trackers/glx/xlib/meson.build @@ -23,5 +23,5 @@ libxlib = static_library( files('glx_api.c', 'glx_getproc.c', 'glx_usefont.c', 'xm_api.c', 'xm_st.c'), c_args : c_vis_args, include_directories : [inc_common, inc_mapi, inc_mesa], - dependencies : [dep_x11, dep_xext, dep_xcb], + dependencies : [dep_x11, dep_xext, dep_xcb, dep_glproto], ) diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h index 7b68c09c47a..0595da5535a 100644 --- a/src/gallium/state_trackers/nine/nine_pipe.h +++ b/src/gallium/state_trackers/nine/nine_pipe.h @@ -377,6 +377,10 @@ d3dmultisample_type_check(struct pipe_screen *screen, if (levels) *levels = 1; + /* Ignores multisamplequality */ + if (*multisample == D3DMULTISAMPLE_NONE) + return D3D_OK; + if (*multisample == D3DMULTISAMPLE_NONMASKABLE) { if (depth_stencil_format(format)) bind = d3d9_get_pipe_depth_format_bindings(format); diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c index 6c22be24c7c..8026ee16b7a 100644 --- a/src/gallium/state_trackers/nine/swapchain9.c +++ b/src/gallium/state_trackers/nine/swapchain9.c @@ -133,6 +133,13 @@ D3DWindowBuffer_release(struct NineSwapChain9 *This, D3DWindowBuffer *present_handle) { int i; + + /* IsBufferReleased API not available */ + if (This->base.device->minor_version_num <= 
2) { + ID3DPresent_DestroyD3DWindowBuffer(This->present, present_handle); + return; + } + /* Add it to the 'pending release' list */ for (i = 0; i < D3DPRESENT_BACK_BUFFERS_MAX_EX + 1; i++) { if (!This->present_handles_pending_release[i]) { @@ -750,9 +757,19 @@ present( struct NineSwapChain9 *This, if (This->params.SwapEffect == D3DSWAPEFFECT_DISCARD) handle_draw_cursor_and_hud(This, resource); - ID3DPresent_GetWindowInfo(This->present, hDestWindowOverride, &target_width, &target_height, &target_depth); + hr = ID3DPresent_GetWindowInfo(This->present, hDestWindowOverride, &target_width, &target_height, &target_depth); (void)target_depth; + /* Can happen with old Wine (presentation can still succeed), + * or at window destruction. + * Also disable for very old wine as D3DWindowBuffer_release + * cannot do the DestroyD3DWindowBuffer workaround. */ + if (FAILED(hr) || target_width == 0 || target_height == 0 || + This->base.device->minor_version_num <= 2) { + target_width = resource->width0; + target_height = resource->height0; + } + /* Switch to using presentation buffers on window resize. 
* Note: Most apps should resize the d3d back buffers when * a window resize is detected, which will result in a call to diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index 14e904ee490..47a5e7be230 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx) ctx->version_minor = 1; *ctx->vtable = vtable; *ctx->vtable_vpp = vtable_vpp; - ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN; + ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1; ctx->max_entrypoints = 2; ctx->max_attributes = 1; ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS; diff --git a/src/gallium/state_trackers/va/picture_mpeg12.c b/src/gallium/state_trackers/va/picture_mpeg12.c index 1e5a9c7428d..daf95f7403c 100644 --- a/src/gallium/state_trackers/va/picture_mpeg12.c +++ b/src/gallium/state_trackers/va/picture_mpeg12.c @@ -27,6 +27,19 @@ #include "va_private.h" +const int reverse_inverse_zscan[] = +{ + /* Reverse inverse z scan pattern */ + 0, 2, 3, 9, 10, 20, 21, 35, + 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, + 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, + 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, + 28, 42, 43, 53, 54, 60, 61, 63, +}; + void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { VAPictureParameterBufferMPEG2 *mpeg2 = buf->data; @@ -66,16 +79,29 @@ void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *contex void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf) { VAIQMatrixBufferMPEG2 *mpeg2 = buf->data; + static uint8_t temp_intra_matrix[64]; + static uint8_t temp_nonintra_matrix[64]; assert(buf->size >= sizeof(VAIQMatrixBufferMPEG2) && buf->num_elements == 1); - if (mpeg2->load_intra_quantiser_matrix) - 
context->desc.mpeg12.intra_matrix = mpeg2->intra_quantiser_matrix; - else + if (mpeg2->load_intra_quantiser_matrix) { + /* The quantiser matrix that VAAPI provides has been applied + with inverse z-scan. However, what we expect in MPEG2 + picture description is the original order. Therefore, + we need to reverse it back to its original order. + */ + for (int i = 0; i < 64; i++) + temp_intra_matrix[i] = + mpeg2->intra_quantiser_matrix[reverse_inverse_zscan[i]]; + context->desc.mpeg12.intra_matrix = temp_intra_matrix; + } else context->desc.mpeg12.intra_matrix = NULL; - if (mpeg2->load_non_intra_quantiser_matrix) - context->desc.mpeg12.non_intra_matrix = mpeg2->non_intra_quantiser_matrix; - else + if (mpeg2->load_non_intra_quantiser_matrix) { + for (int i = 0; i < 64; i++) + temp_nonintra_matrix[i] = + mpeg2->non_intra_quantiser_matrix[reverse_inverse_zscan[i]]; + context->desc.mpeg12.non_intra_matrix = temp_nonintra_matrix; + } else context->desc.mpeg12.non_intra_matrix = NULL; } diff --git a/src/gallium/state_trackers/va/picture_vp9.c b/src/gallium/state_trackers/va/picture_vp9.c index c1ca54cd008..b5aca9a513c 100644 --- a/src/gallium/state_trackers/va/picture_vp9.c +++ b/src/gallium/state_trackers/va/picture_vp9.c @@ -28,6 +28,8 @@ #include "vl/vl_vlc.h" #include "va_private.h" +#define NUM_VP9_REFS 8 + void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { VADecPictureParameterBufferVP9 *vp9 = buf->data; @@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth; - for (i = 0 ; i < 8 ; i++) + for (i = 0 ; i < NUM_VP9_REFS ; i++) vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]); + + if (!context->decoder && !context->templat.max_references) + context->templat.max_references = NUM_VP9_REFS; } void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf) diff --git 
a/src/gallium/state_trackers/xvmc/attributes.c b/src/gallium/state_trackers/xvmc/attributes.c index 375705669b0..6e4d78a9a29 100644 --- a/src/gallium/state_trackers/xvmc/attributes.c +++ b/src/gallium/state_trackers/xvmc/attributes.c @@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int if (!attr) return XvMCBadContext; - if (strcmp(attr, XV_BRIGHTNESS)) + if (strcmp(attr, XV_BRIGHTNESS) == 0) context_priv->procamp.brightness = value / 1000.0f; - else if (strcmp(attr, XV_CONTRAST)) + else if (strcmp(attr, XV_CONTRAST) == 0) context_priv->procamp.contrast = value / 1000.0f + 1.0f; - else if (strcmp(attr, XV_SATURATION)) + else if (strcmp(attr, XV_SATURATION) == 0) context_priv->procamp.saturation = value / 1000.0f + 1.0f; - else if (strcmp(attr, XV_HUE)) + else if (strcmp(attr, XV_HUE) == 0) context_priv->procamp.hue = value / 1000.0f; - else if (strcmp(attr, XV_COLORSPACE)) + else if (strcmp(attr, XV_COLORSPACE) == 0) context_priv->color_standard = value ? 
VL_CSC_COLOR_STANDARD_BT_601 : VL_CSC_COLOR_STANDARD_BT_709; @@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int if (!attr) return XvMCBadContext; - if (strcmp(attr, XV_BRIGHTNESS)) + if (strcmp(attr, XV_BRIGHTNESS) == 0) *value = context_priv->procamp.brightness * 1000; - else if (strcmp(attr, XV_CONTRAST)) + else if (strcmp(attr, XV_CONTRAST) == 0) *value = context_priv->procamp.contrast * 1000 - 1000; - else if (strcmp(attr, XV_SATURATION)) + else if (strcmp(attr, XV_SATURATION) == 0) *value = context_priv->procamp.saturation * 1000 + 1000; - else if (strcmp(attr, XV_HUE)) + else if (strcmp(attr, XV_HUE) == 0) *value = context_priv->procamp.hue * 1000; - else if (strcmp(attr, XV_COLORSPACE)) + else if (strcmp(attr, XV_COLORSPACE) == 0) *value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709; else return BadName; diff --git a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c index 3cd23173c7c..dbd705639f6 100644 --- a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c +++ b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c @@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config) while (token && !fail) { - if (strcmp(token, "i")) + if (strcmp(token, "i") == 0) config->mb_types |= MB_TYPE_I; - else if (strcmp(token, "p")) + else if (strcmp(token, "p") == 0) config->mb_types |= MB_TYPE_P; - else if (strcmp(token, "b")) + else if (strcmp(token, "b") == 0) config->mb_types |= MB_TYPE_B; else fail = 1; diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build index bc72b1110a0..b3c31c5dc6d 100644 --- a/src/gallium/targets/d3dadapter9/meson.build +++ b/src/gallium/targets/d3dadapter9/meson.build @@ -68,5 +68,5 @@ pkg.generate( description : 'Native D3D driver modules', version : '.'.join(nine_version), requires_private : 'libdrm >= ' + dep_libdrm.version(), - variables : 
['moduledir=${prefix}/@0@'.format(d3d_drivers_path)], + variables : ['moduledir=@0@'.format(d3d_drivers_path)], ) diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk index 9c43fa1e8fd..6134251b5ca 100644 --- a/src/gallium/targets/dri/Android.mk +++ b/src/gallium/targets/dri/Android.mk @@ -40,12 +40,23 @@ LOCAL_LDFLAGS := \ -Wl,--undefined-version LOCAL_SHARED_LIBRARIES := \ - libbacktrace \ libdl \ libglapi \ - libexpat \ libz +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else +LOCAL_SHARED_LIBRARIES += \ + libexpat +endif + +ifeq ($(USE_LIBBACKTRACE),true) + LOCAL_SHARED_LIBRARIES += libbacktrace +endif + $(foreach d, $(MESA_BUILD_GALLIUM), $(eval LOCAL_CFLAGS += $(patsubst HAVE_%,-D%,$(d)))) # sort GALLIUM_LIBS to remove any duplicates diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build index 68d226621b2..edd0c007e48 100644 --- a/src/gallium/targets/dri/meson.build +++ b/src/gallium/targets/dri/meson.build @@ -60,6 +60,10 @@ libgallium_dri = shared_library( driver_tegra, driver_i915, driver_svga, driver_virgl, driver_swr, ], + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : dri_drivers_path, + name_suffix : 'so', ) foreach d : [[with_gallium_kmsro, 'pl111_dri.so'], diff --git a/src/gallium/targets/omx/meson.build b/src/gallium/targets/omx/meson.build index 6811e6ff904..7772ae47bb5 100644 --- a/src/gallium/targets/omx/meson.build +++ b/src/gallium/targets/omx/meson.build @@ -32,7 +32,7 @@ endif libomx_gallium = shared_library( 'omx_mesa', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [omx_link_args, ld_args_gc_sections], diff --git a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build index b4ae8f4b6ec..e873e311aa0 
100644 --- a/src/gallium/targets/osmesa/meson.build +++ b/src/gallium/targets/osmesa/meson.build @@ -43,9 +43,9 @@ libosmesa = shared_library( inc_gallium_drivers, ], link_depends : osmesa_link_deps, - link_whole : [libosmesa_st], + link_whole : [libosmesa_st, libglapi_static], link_with : [ - libmesa_gallium, libgallium, libglapi_static, libws_null, osmesa_link_with, + libmesa_gallium, libgallium, libws_null, osmesa_link_with, ], dependencies : [ dep_selinux, dep_thread, dep_clock, dep_unwind, diff --git a/src/gallium/targets/va/meson.build b/src/gallium/targets/va/meson.build index ded689b464d..4bfb5cbab7a 100644 --- a/src/gallium/targets/va/meson.build +++ b/src/gallium/targets/va/meson.build @@ -33,7 +33,7 @@ endif libva_gallium = shared_library( 'gallium_drv_video', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [va_link_args, ld_args_gc_sections], @@ -49,8 +49,10 @@ libva_gallium = shared_library( dep_libdrm, dep_thread, driver_r600, driver_radeonsi, driver_nouveau, ], link_depends : va_link_depends, + # Will be deleted during installation, see install_megadrivers.py install : true, install_dir : va_drivers_path, + name_suffix : 'so', ) foreach d : [[with_gallium_r600, 'r600'], diff --git a/src/gallium/targets/vdpau/meson.build b/src/gallium/targets/vdpau/meson.build index 22e3f5ffdd8..48f01ffba6c 100644 --- a/src/gallium/targets/vdpau/meson.build +++ b/src/gallium/targets/vdpau/meson.build @@ -38,7 +38,7 @@ endif libvdpau_gallium = shared_library( 'vdpau_gallium', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [vdpau_link_args, ld_args_gc_sections], @@ -55,6 +55,10 @@ libvdpau_gallium = shared_library( ], link_depends : vdpau_link_depends, soversion : '@0@.@1@.0'.format(VDPAU_MAJOR, VDPAU_MINOR), + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : vdpau_drivers_path, + name_suffix : 'so', ) 
foreach d : [[with_gallium_r300, 'r300'], [with_gallium_r600, 'r600'], diff --git a/src/gallium/targets/xa/meson.build b/src/gallium/targets/xa/meson.build index 733ef54ff85..582d5ef67f6 100644 --- a/src/gallium/targets/xa/meson.build +++ b/src/gallium/targets/xa/meson.build @@ -34,7 +34,7 @@ _xa_version = '.'.join(xa_version) libxatracker = shared_library( 'xatracker', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [xa_link_args, ld_args_gc_sections], diff --git a/src/gallium/targets/xvmc/meson.build b/src/gallium/targets/xvmc/meson.build index 0af5b6477ce..537275aab57 100644 --- a/src/gallium/targets/xvmc/meson.build +++ b/src/gallium/targets/xvmc/meson.build @@ -33,7 +33,7 @@ endif libxvmc_gallium = shared_library( 'XvMCgallium', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [xvmc_link_args, ld_args_gc_sections], @@ -47,6 +47,10 @@ libxvmc_gallium = shared_library( ], dependencies : [dep_thread, driver_r600, driver_nouveau], link_depends : xvmc_link_depends, + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : xvmc_drivers_path, + name_suffix : 'so', ) foreach d : [[with_gallium_r600, 'r600'], [with_gallium_nouveau, 'nouveau']] diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index b4e62acbae4..2e595e5a1b0 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -386,7 +386,8 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs) cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE && cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC && cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC && - cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC; + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC && + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG; } static bool amdgpu_cs_has_chaining(struct amdgpu_cs 
*cs) @@ -1219,8 +1220,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs) { struct amdgpu_cs_context *cs = acs->csc; - cs->num_fence_dependencies = 0; - amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers); amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 79d2c1345ef..45e54b4791d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -92,6 +92,10 @@ static bool do_winsys_init(struct amdgpu_winsys *ws, if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo)) goto fail; + /* TODO: Enable this once the kernel handles it efficiently. */ + if (ws->info.has_dedicated_vram) + ws->info.has_local_buffers = false; + handle_env_var_force_family(ws); ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment); diff --git a/src/gallium/winsys/svga/drm/Makefile.sources b/src/gallium/winsys/svga/drm/Makefile.sources index f82b0097b5b..191f0b88b4a 100644 --- a/src/gallium/winsys/svga/drm/Makefile.sources +++ b/src/gallium/winsys/svga/drm/Makefile.sources @@ -8,6 +8,8 @@ C_SOURCES := \ vmw_fence.c \ vmw_fence.h \ vmwgfx_drm.h \ + vmw_msg.c \ + vmw_msg.h \ vmw_screen.c \ vmw_screen_dri.c \ vmw_screen.h \ diff --git a/src/gallium/winsys/svga/drm/meson.build b/src/gallium/winsys/svga/drm/meson.build index 24f67aca9ec..55266ce1623 100644 --- a/src/gallium/winsys/svga/drm/meson.build +++ b/src/gallium/winsys/svga/drm/meson.build @@ -23,6 +23,7 @@ files_svgadrm = files( 'vmw_buffer.c', 'vmw_context.c', 'vmw_fence.c', + 'vmw_msg.c', 'vmw_screen.c', 'vmw_screen_dri.c', 'vmw_screen_ioctl.c', diff --git a/src/gallium/drivers/svga/svga_msg.c b/src/gallium/winsys/svga/drm/vmw_msg.c old 
mode 100755 new mode 100644 similarity index 93% rename from src/gallium/drivers/svga/svga_msg.c rename to src/gallium/winsys/svga/drm/vmw_msg.c index 8b63132cb57..8cce2241f36 --- a/src/gallium/drivers/svga/svga_msg.c +++ b/src/gallium/winsys/svga/drm/vmw_msg.c @@ -29,7 +29,8 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "pipe/p_defines.h" -#include "svga_msg.h" +#include "svga_winsys.h" +#include "vmw_msg.h" #define MESSAGE_STATUS_SUCCESS 0x0001 @@ -83,7 +84,7 @@ port_num, magic, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("inl %%dx, %%eax;" : \ + __asm__ volatile ("inl %%dx, %%eax;" : \ "=a"(ax), \ "=b"(bx), \ "=c"(cx), \ @@ -128,7 +129,7 @@ typedef uint64_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%rbp;" \ + __asm__ volatile ("push %%rbp;" \ "movq %12, %%rbp;" \ "rep outsb;" \ "pop %%rbp;" : \ @@ -152,7 +153,7 @@ typedef uint64_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%rbp;" \ + __asm__ volatile ("push %%rbp;" \ "movq %12, %%rbp;" \ "rep insb;" \ "pop %%rbp" : \ @@ -183,7 +184,7 @@ typedef uint32_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%ebp;" \ + __asm__ volatile ("push %%ebp;" \ "mov %12, %%ebp;" \ "rep outsb;" \ "pop %%ebp;" : \ @@ -208,7 +209,7 @@ typedef uint32_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%ebp;" \ + __asm__ volatile ("push %%ebp;" \ "mov %12, %%ebp;" \ "rep insb;" \ "pop %%ebp" : \ @@ -252,7 +253,7 @@ typedef uint32_t VMW_REG; (void) in_cx; (void) bp; \ (void) ax; (void) bx; (void) cx; \ (void) dx; (void) si; (void) di; - + #define VMW_PORT_HB_IN(cmd, in_cx, in_si, in_di, \ port_num, magic, bp, \ @@ -283,7 +284,7 @@ struct rpc_channel { /** - * svga_open_channel + * vmw_open_channel * * @channel: RPC channel * @protocol: @@ -291,7 +292,7 @@ struct rpc_channel { * Returns: PIPE_OK on success, PIPE_ERROR otherwise */ 
static enum pipe_error -svga_open_channel(struct rpc_channel *channel, unsigned protocol) +vmw_open_channel(struct rpc_channel *channel, unsigned protocol) { VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si = 0, di = 0; @@ -321,7 +322,7 @@ svga_open_channel(struct rpc_channel *channel, unsigned protocol) * Returns: PIPE_OK on success, PIPE_ERROR otherwises */ static enum pipe_error -svga_close_channel(struct rpc_channel *channel) +vmw_close_channel(struct rpc_channel *channel) { VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di; @@ -344,7 +345,7 @@ svga_close_channel(struct rpc_channel *channel) /** - * svga_send_msg: Sends a message to the host + * vmw_send_msg: Sends a message to the host * * @channel: RPC channel * @logmsg: NULL terminated string @@ -352,7 +353,7 @@ svga_close_channel(struct rpc_channel *channel) * Returns: PIPE_OK on success */ static enum pipe_error -svga_send_msg(struct rpc_channel *channel, const char *msg) +vmw_send_msg(struct rpc_channel *channel, const char *msg) { VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di, bp; size_t msg_len = strlen(msg); @@ -406,46 +407,42 @@ svga_send_msg(struct rpc_channel *channel, const char *msg) /** - * svga_host_log: Sends a log message to the host + * vmw_svga_winsys_host_log: Sends a log message to the host * * @log: NULL terminated string * - * Returns: PIPE_OK on success */ -enum pipe_error -svga_host_log(const char *log) +void +vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log) { struct rpc_channel channel; char *msg; int msg_len; - enum pipe_error ret = PIPE_OK; #ifdef MSG_NOT_IMPLEMENTED - return ret; + return; #endif if (!log) - return ret; + return; msg_len = strlen(log) + strlen("log ") + 1; msg = CALLOC(1, msg_len); if (msg == NULL) { debug_printf("Cannot allocate memory for log message\n"); - return PIPE_ERROR_OUT_OF_MEMORY; + return; } util_sprintf(msg, "log %s", log); - if (svga_open_channel(&channel, RPCI_PROTOCOL_NUM) || - svga_send_msg(&channel, msg) || - 
svga_close_channel(&channel)) { + if (vmw_open_channel(&channel, RPCI_PROTOCOL_NUM) || + vmw_send_msg(&channel, msg) || + vmw_close_channel(&channel)) { debug_printf("Failed to send log\n"); - - ret = PIPE_ERROR; } FREE(msg); - return ret; + return; } diff --git a/src/gallium/drivers/svga/svga_msg.h b/src/gallium/winsys/svga/drm/vmw_msg.h similarity index 89% rename from src/gallium/drivers/svga/svga_msg.h rename to src/gallium/winsys/svga/drm/vmw_msg.h index 9132ba7e240..57057f23638 100644 --- a/src/gallium/drivers/svga/svga_msg.h +++ b/src/gallium/winsys/svga/drm/vmw_msg.h @@ -26,17 +26,16 @@ * Author: * Sinclair Yeh */ -#ifndef _SVGA_MSG_H -#define _SVGA_MSG_H +#ifndef _VMW_MSG_H +#define _VMW_MSG_H /** - * svga_host_log: Sends a log message to the host + * vmw_host_log: Sends a log message to the host * * @log: NULL terminated string * - * Returns: PIPE_OK on success */ -enum pipe_error svga_host_log(const char *log); +void vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log); #endif diff --git a/src/gallium/winsys/svga/drm/vmw_screen_svga.c b/src/gallium/winsys/svga/drm/vmw_screen_svga.c index a6990414e20..cd3f21f6033 100644 --- a/src/gallium/winsys/svga/drm/vmw_screen_svga.c +++ b/src/gallium/winsys/svga/drm/vmw_screen_svga.c @@ -48,6 +48,7 @@ #include "vmw_surface.h" #include "vmw_buffer.h" #include "vmw_fence.h" +#include "vmw_msg.h" #include "vmw_shader.h" #include "vmw_query.h" #include "svga3d_surfacedefs.h" @@ -509,6 +510,8 @@ vmw_winsys_screen_init_svga(struct vmw_winsys_screen *vws) vws->base.stats_time_push = vmw_svga_winsys_stats_time_push; vws->base.stats_time_pop = vmw_svga_winsys_stats_time_pop; + vws->base.host_log = vmw_svga_winsys_host_log; + return TRUE; } diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c index 8753139107c..a4c1d50453b 100644 --- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c +++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c @@ -396,6 +396,7 @@ 
xlib_displaytarget_create(struct sw_winsys *winsys, { struct xlib_displaytarget *xlib_dt; unsigned nblocksy, size; + int ignore; xlib_dt = CALLOC_STRUCT(xlib_displaytarget); if (!xlib_dt) @@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys, xlib_dt->stride = align(util_format_get_stride(format, width), alignment); size = xlib_dt->stride * nblocksy; - if (!debug_get_option_xlib_no_shm()) { + if (!debug_get_option_xlib_no_shm() && + XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) { xlib_dt->data = alloc_shm(xlib_dt, size); if (xlib_dt->data) { xlib_dt->shm = True; diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c index 26de8c702df..a2d232a539c 100644 --- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c +++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c @@ -46,7 +46,7 @@ #define VIRGL_DRM_VERSION(major, minor) ((major) << 16 | (minor)) -#define VIRGL_DRM_VERSION_FENCE_FD VIRGL_DRM_VERSION(1, 0) +#define VIRGL_DRM_VERSION_FENCE_FD VIRGL_DRM_VERSION(0, 1) static inline boolean can_cache_resource(struct virgl_hw_res *res) @@ -870,7 +870,7 @@ static int virgl_drm_get_version(int fd) else if (version->version_major != 0) ret = -EINVAL; else - ret = version->version_minor; + ret = VIRGL_DRM_VERSION(0, version->version_minor); drmFreeVersion(version); diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c index 298adc80ef1..d53fc87e21e 100644 --- a/src/glx/dri3_glx.c +++ b/src/glx/dri3_glx.c @@ -642,7 +642,6 @@ dri3_set_swap_interval(__GLXDRIdrawable *pdraw, int interval) break; } - priv->swap_interval = interval; loader_dri3_set_swap_interval(&priv->loader_drawable, interval); return 0; @@ -659,7 +658,7 @@ dri3_get_swap_interval(__GLXDRIdrawable *pdraw) struct dri3_drawable *priv = (struct dri3_drawable *) pdraw; - return priv->swap_interval; + return priv->loader_drawable.swap_interval; } static void diff --git a/src/glx/dri3_priv.h b/src/glx/dri3_priv.h index 
1d3c03f9997..32a8d3f7e7d 100644 --- a/src/glx/dri3_priv.h +++ b/src/glx/dri3_priv.h @@ -117,7 +117,6 @@ struct dri3_context struct dri3_drawable { __GLXDRIdrawable base; struct loader_dri3_drawable loader_drawable; - int swap_interval; /* LIBGL_SHOW_FPS support */ uint64_t previous_ust; diff --git a/src/glx/drisw_glx.c b/src/glx/drisw_glx.c index 00c7fa100ab..48c03ca42e0 100644 --- a/src/glx/drisw_glx.c +++ b/src/glx/drisw_glx.c @@ -147,6 +147,9 @@ XDestroyDrawable(struct drisw_drawable * pdp, Display * dpy, XID drawable) if (pdp->ximage) XDestroyImage(pdp->ximage); + if (pdp->shminfo.shmid > 0) + XShmDetach(dpy, &pdp->shminfo); + free(pdp->visinfo); XFreeGC(dpy, pdp->gc); diff --git a/src/intel/Android.common.mk b/src/intel/Android.common.mk index 12cea6e5472..79d9f1284a0 100644 --- a/src/intel/Android.common.mk +++ b/src/intel/Android.common.mk @@ -38,7 +38,17 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa -LOCAL_SHARED_LIBRARIES := libexpat libz +LOCAL_SHARED_LIBRARIES := libz liblog + +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else +LOCAL_SHARED_LIBRARIES += \ + libexpat +endif + LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml diff --git a/src/intel/Android.compiler.mk b/src/intel/Android.compiler.mk index c2b01221dfc..41af7b20b9c 100644 --- a/src/intel/Android.compiler.mk +++ b/src/intel/Android.compiler.mk @@ -28,7 +28,7 @@ # --------------------------------------- include $(CLEAR_VARS) - +LOCAL_CFLAGS += -Wno-error LOCAL_MODULE := libmesa_intel_compiler LOCAL_MODULE_CLASS := STATIC_LIBRARIES diff --git a/src/intel/Android.dev.mk b/src/intel/Android.dev.mk index cd2ed66a176..3011ee232ed 100644 --- a/src/intel/Android.dev.mk +++ b/src/intel/Android.dev.mk @@ -33,5 +33,8 @@ LOCAL_C_INCLUDES := $(MESA_TOP)/include/drm-uapi LOCAL_SRC_FILES := $(DEV_FILES) +LOCAL_CFLAGS := \ + 
-Wno-gnu-variable-sized-type-not-at-end + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/intel/Android.vulkan.mk b/src/intel/Android.vulkan.mk index 7019c8cbc8f..73586803552 100644 --- a/src/intel/Android.vulkan.mk +++ b/src/intel/Android.vulkan.mk @@ -23,9 +23,10 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) include $(LOCAL_PATH)/Makefile.sources -VK_ENTRYPOINTS_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py - -VK_EXTENSIONS_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/vulkan/anv_extensions_gen.py +ANV_ENTRYPOINTS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py +ANV_EXTENSIONS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions_gen.py +ANV_EXTENSIONS_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions.py +VULKAN_API_XML := $(MESA_TOP)/src/vulkan/registry/vk.xml VULKAN_COMMON_INCLUDES := \ $(MESA_TOP)/include \ @@ -41,6 +42,18 @@ VULKAN_COMMON_INCLUDES := \ $(MESA_TOP)/src/compiler \ frameworks/native/vulkan/include +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +VULKAN_COMMON_INCLUDES += \ + frameworks/native/vulkan/include \ + frameworks/native/libs/nativebase/include \ + frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include + +VULKAN_COMMON_HEADER_LIBRARIES := \ + libcutils_headers \ + libhardware_headers +endif + # libmesa_anv_entrypoints with header and dummy.c # # This static library is built to pull entrypoints header @@ -59,16 +72,28 @@ LOCAL_C_INCLUDES := \ LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_entrypoints.h LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/dummy.c +LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.h $(intermediates)/vulkan/dummy.c: @mkdir -p $(dir $@) @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) touch $@ -$(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c - $(VK_ENTRYPOINTS_SCRIPT) \ +$(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c \ 
+ $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ --outdir $(dir $@) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml + --xml $(VULKAN_API_XML) + +$(intermediates)/vulkan/anv_extensions.h: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ + --out-h $@ LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(intermediates) @@ -107,6 +132,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -127,6 +153,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -147,6 +174,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -167,6 +195,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -187,6 +216,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ 
-207,6 +237,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -218,7 +249,7 @@ include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) LOCAL_MODULE := libmesa_vulkan_common LOCAL_MODULE_CLASS := STATIC_LIBRARIES - +LOCAL_CFLAGS += -Wno-error intermediates := $(call local-generated-sources-dir) LOCAL_SRC_FILES := $(VULKAN_FILES) @@ -240,27 +271,25 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_entrypoints.c LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.c -LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.h -$(intermediates)/vulkan/anv_entrypoints.c: +$(intermediates)/vulkan/anv_entrypoints.c: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) @mkdir -p $(dir $@) - $(VK_ENTRYPOINTS_SCRIPT) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \ + $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ --outdir $(dir $@) -$(intermediates)/vulkan/anv_extensions.c: +$(intermediates)/vulkan/anv_extensions.c: $(ANV_EXTENSIONS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) @mkdir -p $(dir $@) - $(VK_EXTENSIONS_SCRIPT) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \ + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ --out-c $@ -$(intermediates)/vulkan/anv_extensions.h: - @mkdir -p $(dir $@) - $(VK_EXTENSIONS_SCRIPT) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \ - --out-h $@ - LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -310,6 +339,16 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_anv_entrypoints LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz 
libsync liblog +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) + +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else + LOCAL_SHARED_LIBRARIES += \ + libexpat +endif include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/src/intel/Makefile.isl.am b/src/intel/Makefile.isl.am index a6733f3ba8e..dcb9d3ad6fc 100644 --- a/src/intel/Makefile.isl.am +++ b/src/intel/Makefile.isl.am @@ -33,12 +33,15 @@ ISL_GEN_LIBS = \ noinst_LTLIBRARIES += $(ISL_GEN_LIBS) \ isl/libisl.la \ - libisl_tiled_memcpy.la \ - libisl_tiled_memcpy_sse41.la + libisl_tiled_memcpy.la isl_libisl_la_LIBADD = $(ISL_GEN_LIBS) \ - libisl_tiled_memcpy.la \ - libisl_tiled_memcpy_sse41.la + libisl_tiled_memcpy.la + +if SSE41_SUPPORTED +isl_libisl_la_LIBADD += libisl_tiled_memcpy_sse41.la +noinst_LTLIBRARIES += libisl_tiled_memcpy_sse41.la +endif isl_libisl_la_SOURCES = $(ISL_FILES) $(ISL_GENERATED_FILES) diff --git a/src/intel/Makefile.vulkan.am b/src/intel/Makefile.vulkan.am index b315f10a01a..cad0a57bc7f 100644 --- a/src/intel/Makefile.vulkan.am +++ b/src/intel/Makefile.vulkan.am @@ -253,6 +253,7 @@ VULKAN_TESTS = \ vulkan/tests/block_pool_no_free \ vulkan/tests/state_pool_no_free \ vulkan/tests/state_pool_free_list_only \ + vulkan/tests/state_pool_padding \ vulkan/tests/state_pool VULKAN_TEST_LDADD = \ @@ -274,6 +275,10 @@ vulkan_tests_state_pool_free_list_only_CFLAGS = $(VULKAN_CFLAGS) vulkan_tests_state_pool_free_list_only_CPPFLAGS = $(VULKAN_CPPFLAGS) vulkan_tests_state_pool_free_list_only_LDADD = $(VULKAN_TEST_LDADD) +vulkan_tests_state_pool_padding_CFLAGS = $(VULKAN_CFLAGS) +vulkan_tests_state_pool_padding_CPPFLAGS = $(VULKAN_CPPFLAGS) +vulkan_tests_state_pool_padding_LDADD = $(VULKAN_TEST_LDADD) + vulkan_tests_state_pool_CFLAGS = $(VULKAN_CFLAGS) vulkan_tests_state_pool_CPPFLAGS = $(VULKAN_CPPFLAGS) vulkan_tests_state_pool_LDADD = 
$(VULKAN_TEST_LDADD) diff --git a/src/intel/blorp/meson.build b/src/intel/blorp/meson.build index c1201b0aa16..ff68d255164 100644 --- a/src/intel/blorp/meson.build +++ b/src/intel/blorp/meson.build @@ -33,5 +33,5 @@ libblorp = static_library( files_libblorp, include_directories : [inc_common, inc_intel], c_args : [c_vis_args, no_override_init_args], - dependencies : idep_nir_headers, + dependencies : [idep_nir_headers, idep_genxml], ) diff --git a/src/intel/common/gen_debug.c b/src/intel/common/gen_debug.c index a978f2f5818..8990d208207 100644 --- a/src/intel/common/gen_debug.c +++ b/src/intel/common/gen_debug.c @@ -85,6 +85,7 @@ static const struct debug_control debug_control[] = { { "nohiz", DEBUG_NO_HIZ }, { "color", DEBUG_COLOR }, { "reemit", DEBUG_REEMIT }, + { "heur32", DEBUG_HEUR32 }, { NULL, 0 } }; diff --git a/src/intel/common/gen_debug.h b/src/intel/common/gen_debug.h index 72d7ca20a39..c2ca2e2ebd6 100644 --- a/src/intel/common/gen_debug.h +++ b/src/intel/common/gen_debug.h @@ -83,6 +83,7 @@ extern uint64_t INTEL_DEBUG; #define DEBUG_NO_HIZ (1ull << 39) #define DEBUG_COLOR (1ull << 40) #define DEBUG_REEMIT (1ull << 41) +#define DEBUG_HEUR32 (1ull << 42) /* These flags are not compatible with the disk shader cache */ #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME @@ -90,7 +91,7 @@ extern uint64_t INTEL_DEBUG; /* These flags may affect program generation */ #define DEBUG_DISK_CACHE_MASK \ (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \ - DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32) + DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_HEUR32) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" diff --git a/src/intel/common/meson.build b/src/intel/common/meson.build index 332e978b0ad..ec45962502e 100644 --- a/src/intel/common/meson.build +++ b/src/intel/common/meson.build @@ -43,5 +43,5 @@ libintel_common = static_library( include_directories : [inc_common, inc_intel], c_args : [c_vis_args, 
no_override_init_args], link_with : [libisl], - dependencies : [dep_expat, dep_libdrm, dep_thread], + dependencies : [dep_expat, dep_libdrm, dep_thread, idep_genxml], ) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 61a4528d372..c294e5c3222 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -38,6 +38,15 @@ struct ra_regs; struct nir_shader; struct brw_program; +struct brw_simd32_heuristics_control { + bool grouped_sends_check; + int max_grouped_sends; + bool inst_count_check; + float inst_count_ratio; + bool mrt_check; + int max_mrts; +}; + struct brw_compiler { const struct gen_device_info *devinfo; @@ -118,6 +127,8 @@ struct brw_compiler { * whether nir_opt_large_constants will be run. */ bool supports_shader_constants; + + struct brw_simd32_heuristics_control simd32_heuristics_control; }; /** @@ -196,6 +207,9 @@ struct brw_sampler_prog_key_data { uint32_t yx_xuxv_image_mask; uint32_t xy_uxvx_image_mask; uint32_t ayuv_image_mask; + + /* Scale factor for each texture. 
*/ + float scale_factors[32]; }; /** diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 8dd3b94fbd5..5b29292d6a0 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -251,6 +251,62 @@ fs_inst::is_send_from_grf() const } } +bool +fs_inst::is_control_source(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: + return arg == 0; + + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_SHUFFLE: + case SHADER_OPCODE_QUAD_SWIZZLE: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_IMAGE_SIZE: + case SHADER_OPCODE_GET_BUFFER_SIZE: + return arg == 1; + + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_BYTE_SCATTERED_READ: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + return arg == 1 || arg == 2; + + case SHADER_OPCODE_SEND: + return arg == 0 || arg == 1; + + default: + return false; + } +} + /** * Returns true if this instruction's sources and destinations cannot * safely be the same register. 
@@ -3061,6 +3117,7 @@ fs_visitor::opt_peephole_csel() if (csel_inst != NULL) { progress = true; + csel_inst->saturate = inst->saturate; inst->remove(block); } @@ -3899,18 +3956,22 @@ fs_visitor::lower_integer_multiplication() bool needs_mov = false; fs_reg orig_dst = inst->dst; + + /* Get a new VGRF for the "low" 32x16-bit multiplication result if + * reusing the original destination is impossible due to hardware + * restrictions, source/destination overlap, or it being the null + * register. + */ fs_reg low = inst->dst; if (orig_dst.is_null() || orig_dst.file == MRF || regions_overlap(inst->dst, inst->size_written, inst->src[0], inst->size_read(0)) || regions_overlap(inst->dst, inst->size_written, - inst->src[1], inst->size_read(1))) { + inst->src[1], inst->size_read(1)) || + inst->dst.stride >= 4) { needs_mov = true; - /* Get a new VGRF but keep the same stride as inst->dst */ low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type); - low.stride = inst->dst.stride; - low.offset = inst->dst.offset % REG_SIZE; } /* Get a new VGRF but keep the same stride as inst->dst */ @@ -7542,6 +7603,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, char **error_str) { const struct gen_device_info *devinfo = compiler->devinfo; + bool simd16_failed = false; + bool simd16_spilled = false; shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); brw_nir_lower_fs_inputs(shader, devinfo, key); @@ -7608,10 +7671,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader_time_index16); v16.import_uniforms(&v8); if (!v16.run_fs(allow_spilling, use_rep_send)) { + simd16_failed = true; compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", v16.fail_msg); } else { + simd16_spilled = v16.spilled_any_registers; simd16_cfg = v16.cfg; prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used); @@ -7619,9 +7684,17 @@ brw_compile_fs(const 
struct brw_compiler *compiler, void *log_data, } /* Currently, the compiler only supports SIMD32 on SNB+ */ + const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control; + uint64_t mrts = shader->info.outputs_written << FRAG_RESULT_DATA0; + if (v8.max_dispatch_width >= 32 && !use_rep_send && compiler->devinfo->gen >= 6 && - unlikely(INTEL_DEBUG & DEBUG_DO32)) { + (unlikely(INTEL_DEBUG & DEBUG_DO32) || + (unlikely(INTEL_DEBUG & DEBUG_HEUR32) && + !simd16_failed && !simd16_spilled && + (!ctrl->mrt_check || + (ctrl->mrt_check && + u_count_bits64(&mrts) <= ctrl->max_mrts))))) { /* Try a SIMD32 compile */ fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 32, @@ -7632,9 +7705,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, "SIMD32 shader failed to compile: %s", v32.fail_msg); } else { - simd32_cfg = v32.cfg; - prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; - prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) || + v32.run_heuristic(ctrl)) { + simd32_cfg = v32.cfg; + prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; + prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + } } } @@ -7713,13 +7789,49 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd32_cfg) { - prog_data->dispatch_32 = true; - prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32); + uint32_t offset = g.generate_code(simd32_cfg, 32); + + if (unlikely(INTEL_DEBUG & DEBUG_DO32) || + (unlikely(INTEL_DEBUG & DEBUG_HEUR32) && + (!simd16_cfg || + (simd16_cfg && + (!ctrl->inst_count_check || + (ctrl->inst_count_check && + (float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) { + prog_data->dispatch_32 = true; + prog_data->prog_offset_32 = offset; + } } return g.get_assembly(); } +bool +fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl) { + int 
grouped_sends = 0; + int max_grouped_sends = 0; + bool pass = true; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode >= SHADER_OPCODE_TEX && inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL) { + ++grouped_sends; + } else if (grouped_sends > 0) { + if (grouped_sends > max_grouped_sends) { + max_grouped_sends = grouped_sends; + } + grouped_sends = 0; + } + } + + if (ctrl->grouped_sends_check) { + if (max_grouped_sends > ctrl->max_grouped_sends) { + pass = false; + } + } + + return pass; +} + fs_reg * fs_visitor::emit_cs_work_group_id_setup() { diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 5361b768003..72acf85581e 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -289,6 +289,8 @@ class fs_visitor : public backend_shader void dump_instruction(backend_instruction *inst); void dump_instruction(backend_instruction *inst, FILE *file); + bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl); + const void *const key; const struct brw_sampler_prog_key_data *key_tex; @@ -400,6 +402,7 @@ class fs_generator void enable_debug(const char *shader_name); int generate_code(const cfg_t *cfg, int dispatch_width); + int get_inst_count(int dispatch_width); const unsigned *get_assembly(); private: @@ -495,6 +498,7 @@ class fs_generator struct brw_stage_prog_data * const prog_data; unsigned dispatch_width; /**< 8, 16 or 32 */ + int inst_count[3]; /* for 8, 16 and 32 */ exec_list discard_halt_patches; unsigned promoted_constants; diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp index 5fb522f810f..b58730fbbe5 100644 --- a/src/intel/compiler/brw_fs_cmod_propagation.cpp +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -255,6 +255,13 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block) if (inst->opcode == BRW_OPCODE_AND) break; + /* Not safe to use inequality operators if the types are different + */ + if 
(scan_inst->dst.type != inst->src[0].type && + inst->conditional_mod != BRW_CONDITIONAL_Z && + inst->conditional_mod != BRW_CONDITIONAL_NZ) + break; + /* Comparisons operate differently for ints and floats */ if (scan_inst->dst.type != inst->dst.type && (scan_inst->dst.type == BRW_REGISTER_TYPE_F || diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index c23ce1ef426..bba7eb35830 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -32,9 +32,10 @@ * 12.5 (p356). */ -#define ACP_HASH_SIZE 16 +#define ACP_HASH_SIZE 64 #include "util/bitset.h" +#include "util/u_math.h" #include "brw_fs.h" #include "brw_fs_live_variables.h" #include "brw_cfg.h" @@ -46,6 +47,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */ struct acp_entry : public exec_node { fs_reg dst; fs_reg src; + unsigned global_idx; uint8_t size_written; uint8_t size_read; enum opcode opcode; @@ -142,6 +144,8 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) { acp[next_acp] = entry; + entry->global_idx = next_acp; + /* opt_copy_propagation_local populates out_acp with copies created * in a block which are still live at the end of the block. This * is exactly what we want in the COPY set. @@ -167,21 +171,74 @@ void fs_copy_prop_dataflow::setup_initial_values() { /* Initialize the COPY and KILL sets. */ - foreach_block (block, cfg) { - foreach_inst_in_block(fs_inst, inst, block) { - if (inst->dst.file != VGRF) - continue; + { + /* Create a temporary table of ACP entries which we'll use for efficient + * look-up. Unfortunately, we have to do this in two steps because we + * have to match both sources and destinations and an ACP entry can only + * be in one list at a time. 
+ * + * We choose to make the table size between num_acp/2 and num_acp/4 to + * try and trade off between the time it takes to initialize the table + * via exec_list constructors or make_empty() and the cost of + * collisions. In practice, it doesn't appear to matter too much what + * size we make the table as long as it's roughly the same order of + * magnitude as num_acp. We get most of the benefit of the table + * approach even if we use a table of size ACP_HASH_SIZE though a + * full-sized table is 1-2% faster in practice. + */ + unsigned acp_table_size = util_next_power_of_two(num_acp) / 4; + acp_table_size = MAX2(acp_table_size, ACP_HASH_SIZE); + exec_list *acp_table = new exec_list[acp_table_size]; - /* Mark ACP entries which are killed by this instruction. */ - for (int i = 0; i < num_acp; i++) { - if (regions_overlap(inst->dst, inst->size_written, - acp[i]->dst, acp[i]->size_written) || - regions_overlap(inst->dst, inst->size_written, - acp[i]->src, acp[i]->size_read)) { - BITSET_SET(bd[block->num].kill, i); + /* First, get all the KILLs for instructions which overwrite ACP + * destinations. + */ + for (int i = 0; i < num_acp; i++) { + unsigned idx = acp[i]->dst.nr & (acp_table_size - 1); + acp_table[idx].push_tail(acp[i]); + } + + foreach_block (block, cfg) { + foreach_inst_in_block(fs_inst, inst, block) { + if (inst->dst.file != VGRF) + continue; + + unsigned idx = inst->dst.nr & (acp_table_size - 1); + foreach_in_list(acp_entry, entry, &acp_table[idx]) { + if (regions_overlap(inst->dst, inst->size_written, + entry->dst, entry->size_written)) + BITSET_SET(bd[block->num].kill, entry->global_idx); } } } + + /* Clear the table for the second pass */ + for (unsigned i = 0; i < acp_table_size; i++) + acp_table[i].make_empty(); + + /* Next, get all the KILLs for instructions which overwrite ACP + * sources. 
+ */ + for (int i = 0; i < num_acp; i++) { + unsigned idx = acp[i]->src.nr & (acp_table_size - 1); + acp_table[idx].push_tail(acp[i]); + } + + foreach_block (block, cfg) { + foreach_inst_in_block(fs_inst, inst, block) { + if (inst->dst.file != VGRF) + continue; + + unsigned idx = inst->dst.nr & (acp_table_size - 1); + foreach_in_list(acp_entry, entry, &acp_table[idx]) { + if (regions_overlap(inst->dst, inst->size_written, + entry->src, entry->size_read)) + BITSET_SET(bd[block->num].kill, entry->global_idx); + } + } + } + + delete [] acp_table; } /* Populate the initial values for the livein and liveout sets. For the @@ -904,6 +961,25 @@ fs_visitor::opt_copy_propagation() foreach_block (block, cfg) { progress = opt_copy_propagation_local(copy_prop_ctx, block, out_acp[block->num]) || progress; + + /* If the destination of an ACP entry exists only within this block, + * then there's no need to keep it for dataflow analysis. We can delete + * it from the out_acp table and avoid growing the bitsets any bigger + * than we absolutely have to. + * + * Because nothing in opt_copy_propagation_local touches the block + * start/end IPs and opt_copy_propagation_local is incapable of + * extending the live range of an ACP destination beyond the block, + * it's safe to use the liveness information in this way. + */ + for (unsigned a = 0; a < ACP_HASH_SIZE; a++) { + foreach_in_list_safe(acp_entry, entry, &out_acp[block->num][a]) { + assert(entry->dst.file == VGRF); + if (block->start_ip <= virtual_grf_start[entry->dst.nr] && + virtual_grf_end[entry->dst.nr] <= block->end_ip) + entry->remove(); + } + } } /* Do dataflow analysis for those available copies. 
*/ diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index e3b68fa3165..82c2713a77f 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -90,9 +90,16 @@ brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst, * different execution size when the number of components * written to each destination GRF is not the same. */ - const unsigned width = MIN2(reg_width, phys_width); - brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); - brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + if (reg->stride > 4) { + assert(reg != &inst->dst); + assert(reg->stride * type_sz(reg->type) <= REG_SIZE); + brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, reg->stride, 1, 0); + } else { + const unsigned width = MIN2(reg_width, phys_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + } if (devinfo->gen == 7 && !devinfo->is_haswell) { /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): @@ -2093,6 +2100,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) break; case SHADER_OPCODE_INTERLOCK: + assert(devinfo->gen >= 9); /* The interlock is basically a memory fence issued via sendc */ brw_memory_fence(p, dst, BRW_OPCODE_SENDC); break; @@ -2289,6 +2297,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) fill_count, promoted_constants, before_size, after_size); + inst_count[ffs(dispatch_width) - 4] = before_size / 16; + return start_offset; } @@ -2297,3 +2307,13 @@ fs_generator::get_assembly() { return brw_get_program(p, &prog_data->program_size); } + +int +fs_generator::get_inst_count(int dispatch_width) +{ + if (dispatch_width == 8 || dispatch_width == 16 || dispatch_width == 32) { + return inst_count[ffs(dispatch_width) - 4]; + } else { + return 0; + } +} 
\ No newline at end of file diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp index df50993dee6..c60d4700419 100644 --- a/src/intel/compiler/brw_fs_lower_regioning.cpp +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp @@ -71,15 +71,33 @@ namespace { !is_byte_raw_mov(inst)) { return get_exec_type_size(inst); } else { - unsigned stride = inst->dst.stride * type_sz(inst->dst.type); + /* Calculate the maximum byte stride and the minimum/maximum type + * size across all source and destination operands we are required to + * lower. + */ + unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type); + unsigned min_size = type_sz(inst->dst.type); + unsigned max_size = type_sz(inst->dst.type); for (unsigned i = 0; i < inst->sources; i++) { - if (!is_uniform(inst->src[i])) - stride = MAX2(stride, inst->src[i].stride * - type_sz(inst->src[i].type)); + if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) { + const unsigned size = type_sz(inst->src[i].type); + max_stride = MAX2(max_stride, inst->src[i].stride * size); + min_size = MIN2(min_size, size); + max_size = MAX2(max_size, size); + } } - return stride; + /* All operands involved in lowering need to fit in the calculated + * stride. + */ + assert(max_size <= 4 * min_size); + + /* Attempt to use the largest byte stride among all present operands, + * but never exceed a stride of 4 since that would lead to illegal + * destination regions during lowering. 
+ */ + return MIN2(max_stride, 4 * min_size); } } @@ -92,7 +110,7 @@ namespace { required_dst_byte_offset(const fs_inst *inst) { for (unsigned i = 0; i < inst->sources; i++) { - if (!is_uniform(inst->src[i])) + if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) if (reg_offset(inst->src[i]) % REG_SIZE != reg_offset(inst->dst) % REG_SIZE) return 0; @@ -109,7 +127,7 @@ namespace { has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst, unsigned i) { - if (is_unordered(inst)) { + if (is_unordered(inst) || inst->is_control_source(i)) { return false; } else { const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index f16627b8a64..6f0d9731cfe 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -512,6 +512,15 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) return false; + /* If either opcode has source modifiers, bail. + * + * TODO: We can potentially handle source modifiers if both of the opcodes + * we're combining are signed integers. + */ + if (instr->src[0].abs || instr->src[0].negate || + src0->src[0].abs || src0->src[0].negate) + return false; + unsigned element = nir_src_as_uint(src0->src[1].src); /* Element type to extract.*/ @@ -1484,16 +1493,25 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * Use two instructions and a word or DWord intermediate integer type. 
*/ if (nir_dest_bit_size(instr->dest.dest) == 64) { - const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8); + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); if (instr->op == nir_op_extract_i8) { /* If we need to sign extend, extract to a word first */ fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); bld.MOV(w_temp, subscript(op[0], type, byte)); bld.MOV(result, w_temp); + } else if (byte & 1) { + /* Extract the high byte from the word containing the desired byte + * offset. + */ + bld.SHR(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(8)); } else { /* Otherwise use an AND with 0xff and a word type */ - bld.AND(result, subscript(op[0], type, byte / 2), brw_imm_uw(0xff)); + bld.AND(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(0xff)); } } else { const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 6961cb1caf4..6e18bdfe68a 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -591,7 +591,7 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) */ foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { - for (unsigned i = 0; i < 3; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); } @@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) * messages adding a node interference to the grf127_send_hack_node. * This node has a fixed asignment to grf127. * - * We don't apply it to SIMD16 because previous code avoids any register - * overlap between sources and destination. 
+ * We don't apply it to SIMD16 instructions because previous code avoids + * any register overlap between sources and destination. */ ra_set_node_reg(g, grf127_send_hack_node, 127); - if (dispatch_width == 8) { - foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->is_send_from_grf() && inst->dst.file == VGRF) - ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); - } + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->exec_size < 16 && inst->is_send_from_grf() && + inst->dst.file == VGRF) + ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); } if (spilled_any_registers) { @@ -711,14 +710,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && inst->src[2].file == VGRF && inst->src[3].file == VGRF && - inst->src[2].nr != inst->src[3].nr) { - for (unsigned i = 0; i < inst->mlen; i++) { - for (unsigned j = 0; j < inst->ex_mlen; j++) { - ra_add_node_interference(g, inst->src[2].nr + i, - inst->src[3].nr + j); - } - } - } + inst->src[2].nr != inst->src[3].nr) + ra_add_node_interference(g, inst->src[2].nr, + inst->src[3].nr); } } diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index d05357e822e..c4427a658b0 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -357,6 +357,13 @@ class fs_inst : public backend_instruction { bool can_change_types() const; bool has_source_and_destination_hazard() const; + /** + * Return whether \p arg is a control source of a virtual instruction which + * shouldn't contribute to the execution type and usual regioning + * restriction calculations of arithmetic instructions. + */ + bool is_control_source(unsigned arg) const; + /** * Return the subset of flag registers read by the instruction as a bitset * with byte granularity. 
@@ -461,7 +468,8 @@ get_exec_type(const fs_inst *inst) brw_reg_type exec_type = BRW_REGISTER_TYPE_B; for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != BAD_FILE) { + if (inst->src[i].file != BAD_FILE && + !inst->is_control_source(i)) { const brw_reg_type t = get_exec_type(inst->src[i].type); if (type_sz(t) > type_sz(exec_type)) exec_type = t; diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9dbf06004a4..6b8f4d30c1a 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -590,9 +590,9 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, const bool is_vec4_tessellation = !is_scalar && (nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL); - OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); - OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, - compiler->devinfo->gen >= 6); + OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation); + if (compiler->devinfo->gen >= 6) + OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation); OPT(nir_opt_intrinsics); OPT(nir_opt_idiv_const, 32); @@ -794,6 +794,17 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) OPT(brw_nir_lower_mem_access_bit_sizes); + /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and + * SSBOs, our back-end is capable of loading an entire vec4 at a time and + * we would like to take advantage of that whenever possible regardless of + * whether or not the app gives us full loads. This should allow the + * optimizer to combine UBO and SSBO load operations and save us some send + * messages. 
+ */ + OPT(nir_lower_array_deref_of_vec, + nir_var_mem_ubo | nir_var_mem_ssbo, + nir_lower_direct_array_deref_of_vec_load); + /* Get rid of split copies */ nir = brw_nir_optimize(nir, compiler, is_scalar, false); @@ -842,6 +853,23 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, *producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false); *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); } + + NIR_PASS_V(*producer, nir_lower_io_to_vector, nir_var_shader_out); + NIR_PASS_V(*consumer, nir_lower_io_to_vector, nir_var_shader_in); + + if ((*producer)->info.stage != MESA_SHADER_TESS_CTRL) { + /* Calling lower_io_to_vector creates output variable writes with + * write-masks. On non-TCS outputs, the back-end can't handle it and we + * need to call nir_lower_io_to_temporaries to get rid of them. This, + * in turn, creates temporary variables and extra copy_deref intrinsics + * that we need to clean up. + */ + NIR_PASS_V(*producer, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(*producer), true, false); + NIR_PASS_V(*producer, nir_lower_global_vars_to_local); + NIR_PASS_V(*producer, nir_split_var_copies); + NIR_PASS_V(*producer, nir_lower_var_copies); + } } /* Prepare the given shader for codegen @@ -932,7 +960,9 @@ brw_nir_apply_sampler_key(nir_shader *nir, bool is_scalar) { const struct gen_device_info *devinfo = compiler->devinfo; - nir_lower_tex_options tex_options = { 0 }; + nir_lower_tex_options tex_options = { + .lower_txd_clamp_if_sampler_index_not_lt_16 = true, + }; /* Iron Lake and prior require lowering of all rectangle textures */ if (devinfo->gen < 6) @@ -964,6 +994,10 @@ brw_nir_apply_sampler_key(nir_shader *nir, tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask; tex_options.lower_ayuv_external = key_tex->ayuv_image_mask; + /* Setup array of scaling factors for each texture. 
*/ + memcpy(&tex_options.scale_factors, &key_tex->scale_factors, + sizeof(tex_options.scale_factors)); + if (nir_lower_tex(nir, &tex_options)) { nir_validate_shader(nir, "after nir_lower_tex"); nir = brw_nir_optimize(nir, compiler, is_scalar, false); diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index 4489c682d01..785508f1e3f 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -1160,6 +1160,12 @@ vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo, if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW) return false; + /* If we write to the flag register changing the swizzle would change + * what channels are written to the flag register. + */ + if (writes_flag()) + return false; + /* We can't swizzle implicit accumulator access. We'd have to * reswizzle the producer of the accumulator value in addition * to the consumer (i.e. both MUL and MACH). Just skip this. diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp index 760327d559d..a7a3bb8fb06 100644 --- a/src/intel/compiler/brw_vec4_cmod_propagation.cpp +++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp @@ -173,19 +173,19 @@ opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v) /* Given a sequence like: * - * cmp.ge.f0(8) g21<1>.xF g20<4>.xF g18<4>.xF + * cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF * ... - * cmp.nz.f0(8) null<1>D g21<4>.xD 0D + * cmp.nz.f0(8) null<1>D g21<4>.zD 0D * * Replace it with something like: * - * cmp.ge.f0(8) g22<1>F g20<4>.xF g18<4>.xF - * mov(8) g21<1>.xF g22<1>.xxxxF + * cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF + * mov(8) g21<1>.xF g22<1>.zzzzF * * The added MOV will most likely be removed later. In the * worst case, it should be cheaper to schedule. 
*/ - temp.swizzle = inst->src[0].swizzle; + temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask); temp.type = scan_inst->src[0].type; vec4_instruction *mov = v->MOV(scan_inst->dst, temp); diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp index 659fbb2d1bc..4215af1fb02 100644 --- a/src/intel/compiler/test_fs_cmod_propagation.cpp +++ b/src/intel/compiler/test_fs_cmod_propagation.cpp @@ -889,3 +889,35 @@ TEST_F(cmod_propagation_test, subtract_delete_compare_derp) EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 1)->opcode); EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); } + +TEST_F(cmod_propagation_test, signed_unsigned_comparison_mismatch) +{ + const fs_builder &bld = v->bld; + fs_reg dest0 = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::int_type); + src0.type = BRW_REGISTER_TYPE_W; + + bld.ASR(dest0, negate(src0), brw_imm_d(15)); + bld.CMP(bld.null_reg_ud(), retype(dest0, BRW_REGISTER_TYPE_UD), + brw_imm_ud(0u), BRW_CONDITIONAL_LE); + + /* = Before = + * 0: asr(8) dest:D -src0:W 15D + * 1: cmp.le.f0(8) null:UD dest:UD 0UD + * + * = After = + * (no changes) + */ + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ASR, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 1)->conditional_mod); +} diff --git a/src/intel/dev/gen_device_info.c b/src/intel/dev/gen_device_info.c index 5dbd0607572..625ebc031dc 100644 --- a/src/intel/dev/gen_device_info.c +++ b/src/intel/dev/gen_device_info.c @@ -414,6 +414,7 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = { .has_64bit_types = true, \ .supports_simd16_3src = true, \ .has_surface_tile_offset = true, \ + 
.num_thread_per_eu = 7, \ .max_vs_threads = 504, \ .max_tcs_threads = 504, \ .max_tes_threads = 504, \ @@ -427,7 +428,6 @@ static const struct gen_device_info gen_device_info_bdw_gt1 = { .num_slices = 1, .num_subslices = { 2, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 2, .max_cs_threads = 42, .urb = { @@ -452,7 +452,6 @@ static const struct gen_device_info gen_device_info_bdw_gt2 = { .num_slices = 1, .num_subslices = { 3, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 4, .max_cs_threads = 56, .urb = { @@ -477,7 +476,6 @@ static const struct gen_device_info gen_device_info_bdw_gt3 = { .num_slices = 2, .num_subslices = { 3, 3, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 8, .max_cs_threads = 56, .urb = { @@ -503,7 +501,6 @@ static const struct gen_device_info gen_device_info_chv = { .num_slices = 1, .num_subslices = { 2, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 2, .max_vs_threads = 80, .max_tcs_threads = 80, @@ -609,8 +606,7 @@ static const struct gen_device_info gen_device_info_chv = { #define GEN9_FEATURES \ GEN8_FEATURES, \ GEN9_HW_INFO, \ - .has_sample_with_hiz = true, \ - .num_thread_per_eu = 7 + .has_sample_with_hiz = true static const struct gen_device_info gen_device_info_skl_gt1 = { GEN9_FEATURES, .gt = 1, @@ -777,6 +773,7 @@ static const struct gen_device_info gen_device_info_cfl_gt1 = { .num_subslices = { 2, }, .num_eu_per_subslice = 6, .l3_banks = 2, + .urb.size = 192, .simulator_id = 24, }; static const struct gen_device_info gen_device_info_cfl_gt2 = { diff --git a/src/intel/genxml/gen10.xml b/src/intel/genxml/gen10.xml index 284633aedd4..4cb1f05ae25 100644 --- a/src/intel/genxml/gen10.xml +++ b/src/intel/genxml/gen10.xml @@ -2043,7 +2043,10 @@ - + + + + diff --git a/src/intel/genxml/gen11.xml b/src/intel/genxml/gen11.xml index 95a84a2f597..a7c06c5ab60 100644 --- a/src/intel/genxml/gen11.xml +++ b/src/intel/genxml/gen11.xml @@ -2063,7 +2063,10 @@ - + + + + diff 
--git a/src/intel/genxml/gen7.xml b/src/intel/genxml/gen7.xml index 363fd8664bf..1b2c7d996f9 100644 --- a/src/intel/genxml/gen7.xml +++ b/src/intel/genxml/gen7.xml @@ -1399,7 +1399,10 @@ - + + + + diff --git a/src/intel/genxml/gen75.xml b/src/intel/genxml/gen75.xml index a1da9cae041..95b306139eb 100644 --- a/src/intel/genxml/gen75.xml +++ b/src/intel/genxml/gen75.xml @@ -1713,7 +1713,10 @@ - + + + + diff --git a/src/intel/genxml/gen8.xml b/src/intel/genxml/gen8.xml index 4676d9bca9c..0226d7c0c66 100644 --- a/src/intel/genxml/gen8.xml +++ b/src/intel/genxml/gen8.xml @@ -1816,7 +1816,10 @@ - + + + + diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml index 8afa986df55..88fc2da7885 100644 --- a/src/intel/genxml/gen9.xml +++ b/src/intel/genxml/gen9.xml @@ -1995,7 +1995,10 @@ - + + + + diff --git a/src/intel/genxml/meson.build b/src/intel/genxml/meson.build index d0c982d0f8b..343b4fcc45f 100644 --- a/src/intel/genxml/meson.build +++ b/src/intel/genxml/meson.build @@ -57,3 +57,5 @@ foreach f : gen_xml_files capture : true, ) endforeach + +idep_genxml = declare_dependency(sources : [gen_xml_pack, genX_bits_h, genX_xml_h]) diff --git a/src/intel/meson.build b/src/intel/meson.build index 3c57e79d325..a5bb03e314a 100644 --- a/src/intel/meson.build +++ b/src/intel/meson.build @@ -21,9 +21,9 @@ c_sse2_args = ['-msse2', '-mstackrealign'] inc_intel = include_directories('.') +subdir('genxml') subdir('blorp') subdir('dev') -subdir('genxml') subdir('isl') subdir('common') subdir('compiler') diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c index 006175c8c65..e9cc5764924 100644 --- a/src/intel/vulkan/anv_allocator.c +++ b/src/intel/vulkan/anv_allocator.c @@ -165,7 +165,7 @@ anv_state_table_init(struct anv_state_table *table, goto fail_fd; } - if (!u_vector_init(&table->mmap_cleanups, + if (!u_vector_init(&table->cleanups, round_to_power_of_two(sizeof(struct anv_state_table_cleanup)), 128)) { result = 
vk_error(VK_ERROR_INITIALIZATION_FAILED); @@ -179,12 +179,12 @@ anv_state_table_init(struct anv_state_table *table, uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE; result = anv_state_table_expand_range(table, initial_size); if (result != VK_SUCCESS) - goto fail_mmap_cleanups; + goto fail_cleanups; return VK_SUCCESS; - fail_mmap_cleanups: - u_vector_finish(&table->mmap_cleanups); + fail_cleanups: + u_vector_finish(&table->cleanups); fail_fd: close(table->fd); @@ -195,7 +195,7 @@ static VkResult anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) { void *map; - struct anv_mmap_cleanup *cleanup; + struct anv_state_table_cleanup *cleanup; /* Assert that we only ever grow the pool */ assert(size >= table->state.end); @@ -204,11 +204,11 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) if (size > BLOCK_POOL_MEMFD_SIZE) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - cleanup = u_vector_add(&table->mmap_cleanups); + cleanup = u_vector_add(&table->cleanups); if (!cleanup) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - *cleanup = ANV_MMAP_CLEANUP_INIT; + *cleanup = ANV_STATE_TABLE_CLEANUP_INIT; /* Just leak the old map until we destroy the pool. We can't munmap it * without races or imposing locking on the block allocate fast path. 
On @@ -272,12 +272,12 @@ anv_state_table_finish(struct anv_state_table *table) { struct anv_state_table_cleanup *cleanup; - u_vector_foreach(cleanup, &table->mmap_cleanups) { + u_vector_foreach(cleanup, &table->cleanups) { if (cleanup->map) munmap(cleanup->map, cleanup->size); } - u_vector_finish(&table->mmap_cleanups); + u_vector_finish(&table->cleanups); close(table->fd); } diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 53303e0e745..60d332c33b6 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -128,8 +128,13 @@ static void anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer, struct anv_cmd_pipeline_state *pipe_state) { - for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) - vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]); + for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) { + if (pipe_state->push_descriptors[i]) { + anv_descriptor_set_layout_unref(cmd_buffer->device, + pipe_state->push_descriptors[i]->set.layout); + vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]); + } + } } static void @@ -957,10 +962,11 @@ anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer) return iview; } -static struct anv_push_descriptor_set * -anv_cmd_buffer_get_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, - VkPipelineBindPoint bind_point, - uint32_t set) +static struct anv_descriptor_set * +anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct anv_descriptor_set_layout *layout, + uint32_t _set) { struct anv_cmd_pipeline_state *pipe_state; if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { @@ -971,19 +977,31 @@ anv_cmd_buffer_get_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, } struct anv_push_descriptor_set **push_set = - &pipe_state->push_descriptors[set]; + &pipe_state->push_descriptors[_set]; if (*push_set == 
NULL) { - *push_set = vk_alloc(&cmd_buffer->pool->alloc, - sizeof(struct anv_push_descriptor_set), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + *push_set = vk_zalloc(&cmd_buffer->pool->alloc, + sizeof(struct anv_push_descriptor_set), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (*push_set == NULL) { anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); return NULL; } } - return *push_set; + struct anv_descriptor_set *set = &(*push_set)->set; + + if (set->layout != layout) { + if (set->layout) + anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout); + anv_descriptor_set_layout_ref(layout); + set->layout = layout; + } + set->size = anv_descriptor_set_layout_size(layout); + set->buffer_count = layout->buffer_count; + set->buffer_views = (*push_set)->buffer_views; + + return set; } void anv_CmdPushDescriptorSetKHR( @@ -1001,19 +1019,12 @@ void anv_CmdPushDescriptorSetKHR( struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; - struct anv_push_descriptor_set *push_set = - anv_cmd_buffer_get_push_descriptor_set(cmd_buffer, - pipelineBindPoint, _set); - if (!push_set) + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint, + set_layout, _set); + if (!set) return; - struct anv_descriptor_set *set = &push_set->set; - - set->layout = set_layout; - set->size = anv_descriptor_set_layout_size(set_layout); - set->buffer_count = set_layout->buffer_count; - set->buffer_views = push_set->buffer_views; - /* Go through the user supplied descriptors. 
*/ for (uint32_t i = 0; i < descriptorWriteCount; i++) { const VkWriteDescriptorSet *write = &pDescriptorWrites[i]; @@ -1093,19 +1104,12 @@ void anv_CmdPushDescriptorSetWithTemplateKHR( struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; - struct anv_push_descriptor_set *push_set = - anv_cmd_buffer_get_push_descriptor_set(cmd_buffer, - template->bind_point, _set); - if (!push_set) + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point, + set_layout, _set); + if (!set) return; - struct anv_descriptor_set *set = &push_set->set; - - set->layout = set_layout; - set->size = anv_descriptor_set_layout_size(set_layout); - set->buffer_count = set_layout->buffer_count; - set->buffer_views = push_set->buffer_views; - anv_descriptor_set_write_template(set, cmd_buffer->device, &cmd_buffer->surface_state_stream, diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index a4e466cf3dd..0259abea0bf 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -58,6 +58,9 @@ void anv_GetDescriptorSetLayoutSupport( anv_foreach_stage(s, binding->stageFlags) surface_count[s] += sampler->n_planes; } + } else { + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += binding->descriptorCount; } break; @@ -70,10 +73,10 @@ void anv_GetDescriptorSetLayoutSupport( bool supported = true; for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { - /* Our maximum binding table size is 250 and we need to reserve 8 for - * render targets. 240 is a nice round number. + /* Our maximum binding table size is 240 and we need to reserve 8 for + * render targets. 
*/ - if (surface_count[s] >= 240) + if (surface_count[s] >= MAX_BINDING_TABLE_SIZE - MAX_RTS) supported = false; } @@ -458,6 +461,8 @@ VkResult anv_CreateDescriptorPool( &device->surface_state_pool, 4096); pool->surface_state_free_list = NULL; + list_inithead(&pool->desc_sets); + *pDescriptorPool = anv_descriptor_pool_to_handle(pool); return VK_SUCCESS; @@ -474,7 +479,13 @@ void anv_DestroyDescriptorPool( if (!pool) return; + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_destroy(device, pool, set); + } + anv_state_stream_finish(&pool->surface_state_stream); + vk_free2(&device->alloc, pAllocator, pool); } @@ -486,6 +497,11 @@ VkResult anv_ResetDescriptorPool( ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool); + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_destroy(device, pool, set); + } + pool->next = 0; pool->free_list = EMPTY; anv_state_stream_finish(&pool->surface_state_stream); @@ -630,6 +646,8 @@ anv_descriptor_set_destroy(struct anv_device *device, entry->size = set->size; pool->free_list = (char *) entry - pool->data; } + + list_del(&set->pool_link); } VkResult anv_AllocateDescriptorSets( @@ -652,6 +670,8 @@ VkResult anv_AllocateDescriptorSets( if (result != VK_SUCCESS) break; + list_addtail(&set->pool_link, &pool->desc_sets); + pDescriptorSets[i] = anv_descriptor_set_to_handle(set); } diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index f44b046cf5d..99b512a0387 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -128,6 +128,8 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) */ device->memory.heap_count = 1; device->memory.heaps[0] = (struct anv_memory_heap) { + .vma_start = LOW_HEAP_MIN_ADDRESS, + .vma_size = LOW_HEAP_SIZE, .size = heap_size, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, 
.supports_48bit_addresses = false, @@ -147,11 +149,19 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) device->memory.heap_count = 2; device->memory.heaps[0] = (struct anv_memory_heap) { + .vma_start = HIGH_HEAP_MIN_ADDRESS, + /* Leave the last 4GiB out of the high vma range, so that no state + * base address + size can overflow 48 bits. For more information see + * the comment about Wa32bitGeneralStateOffset in anv_allocator.c + */ + .vma_size = gtt_size - (1ull << 32) - HIGH_HEAP_MIN_ADDRESS, .size = heap_size_48bit, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, .supports_48bit_addresses = true, }; device->memory.heaps[1] = (struct anv_memory_heap) { + .vma_start = LOW_HEAP_MIN_ADDRESS, + .vma_size = LOW_HEAP_SIZE, .size = heap_size_32bit, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, .supports_48bit_addresses = false, @@ -1029,7 +1039,7 @@ void anv_GetPhysicalDeviceProperties( .maxPerStageDescriptorSampledImages = max_samplers, .maxPerStageDescriptorStorageImages = max_images, .maxPerStageDescriptorInputAttachments = 64, - .maxPerStageResources = 250, + .maxPerStageResources = MAX_BINDING_TABLE_SIZE - MAX_RTS, .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */ .maxDescriptorSetUniformBuffers = 6 * 64, /* number of stages * maxPerStageDescriptorUniformBuffers */ .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, @@ -1068,7 +1078,7 @@ void anv_GetPhysicalDeviceProperties( 16 * devinfo->max_cs_threads, 16 * devinfo->max_cs_threads, }, - .subPixelPrecisionBits = 4 /* FIXME */, + .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 4 /* FIXME */, .mipmapPrecisionBits = 4 /* FIXME */, .maxDrawIndexedIndexValue = UINT32_MAX, @@ -1806,18 +1816,16 @@ VkResult anv_CreateDevice( } /* keep the page with address zero out of the allocator */ - util_vma_heap_init(&device->vma_lo, LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE); - device->vma_lo_available = - 
physical_device->memory.heaps[physical_device->memory.heap_count - 1].size; - - /* Leave the last 4GiB out of the high vma range, so that no state base - * address + size can overflow 48 bits. For more information see the - * comment about Wa32bitGeneralStateOffset in anv_allocator.c - */ - util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS, - HIGH_HEAP_SIZE); + struct anv_memory_heap *low_heap = + &physical_device->memory.heaps[physical_device->memory.heap_count - 1]; + util_vma_heap_init(&device->vma_lo, low_heap->vma_start, low_heap->vma_size); + device->vma_lo_available = low_heap->size; + + struct anv_memory_heap *high_heap = + &physical_device->memory.heaps[0]; + util_vma_heap_init(&device->vma_hi, high_heap->vma_start, high_heap->vma_size); device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 : - physical_device->memory.heaps[0].size; + high_heap->size; } /* As per spec, the driver implementation may deny requests to acquire @@ -1866,7 +1874,7 @@ VkResult anv_CreateDevice( result = vk_error(VK_ERROR_INITIALIZATION_FAILED); goto fail_mutex; } - if (pthread_cond_init(&device->queue_submit, NULL) != 0) { + if (pthread_cond_init(&device->queue_submit, &condattr) != 0) { pthread_condattr_destroy(&condattr); result = vk_error(VK_ERROR_INITIALIZATION_FAILED); goto fail_mutex; @@ -2276,8 +2284,11 @@ anv_vma_free(struct anv_device *device, struct anv_bo *bo) util_vma_heap_free(&device->vma_lo, addr_48b, bo->size); device->vma_lo_available += bo->size; } else { - assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS && - addr_48b <= HIGH_HEAP_MAX_ADDRESS); + MAYBE_UNUSED const struct anv_physical_device *physical_device = + &device->instance->physicalDevice; + assert(addr_48b >= physical_device->memory.heaps[0].vma_start && + addr_48b < (physical_device->memory.heaps[0].vma_start + + physical_device->memory.heaps[0].vma_size)); util_vma_heap_free(&device->vma_hi, addr_48b, bo->size); device->vma_hi_available += bo->size; } diff --git 
a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index 22bad94e5b8..577ed111a2a 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -128,14 +128,15 @@ def __init__(self, version, enable): 'device->has_context_priority'), Extension('VK_EXT_pci_bus_info', 2, True), Extension('VK_EXT_post_depth_coverage', 1, 'device->info.gen >= 9'), + Extension('VK_EXT_queue_family_foreign', 1, 'ANDROID'), Extension('VK_EXT_sampler_filter_minmax', 1, 'device->info.gen >= 9'), - Extension('VK_EXT_scalar_block_layout', 1, True), + Extension('VK_EXT_scalar_block_layout', 1, '!ANDROID'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_shader_stencil_export', 1, 'device->info.gen >= 9'), Extension('VK_EXT_transform_feedback', 1, True), Extension('VK_EXT_vertex_attribute_divisor', 3, True), - Extension('VK_GOOGLE_decorate_string', 1, True), - Extension('VK_GOOGLE_hlsl_functionality1', 1, True), + Extension('VK_GOOGLE_decorate_string', 1, '!ANDROID'), + Extension('VK_GOOGLE_hlsl_functionality1', 1, '!ANDROID'), ] class VkVersion: diff --git a/src/intel/vulkan/anv_intel.c b/src/intel/vulkan/anv_intel.c index 08bff9585bc..f6b9584b410 100644 --- a/src/intel/vulkan/anv_intel.c +++ b/src/intel/vulkan/anv_intel.c @@ -64,7 +64,8 @@ VkResult anv_CreateDmaBufImageINTEL( .samples = 1, /* FIXME: Need a way to use X tiling to allow scanout */ .tiling = VK_IMAGE_TILING_OPTIMAL, - .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT, .flags = 0, }}, pAllocator, &image_h); diff --git a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c index 104c58dc5e2..0567a1be939 100644 --- a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c +++ b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c @@ -269,6 +269,7 @@ create_plane_tex_instr_implicit(struct ycbcr_state *state, tex->texture_index = 
old_tex->texture_index; tex->texture_array_size = old_tex->texture_array_size; tex->sampler_index = old_tex->sampler_index; + tex->is_array = old_tex->is_array; nir_ssa_dest_init(&tex->instr, &tex->dest, old_tex->dest.ssa.num_components, diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c index 02f2be60e02..a1c72395831 100644 --- a/src/intel/vulkan/anv_pass.c +++ b/src/intel/vulkan/anv_pass.c @@ -178,12 +178,28 @@ anv_render_pass_compile(struct anv_render_pass *pass) * subpasses and checking to see if any of them don't have an external * dependency. Or, we could just be lazy and add a couple extra flushes. * We choose to be lazy. + * + * From the documentation for vkCmdNextSubpass: + * + * "Moving to the next subpass automatically performs any multisample + * resolve operations in the subpass being ended. End-of-subpass + * multisample resolves are treated as color attachment writes for the + * purposes of synchronization. This applies to resolve operations for + * both color and depth/stencil attachments. That is, they are + * considered to execute in the + * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and + * their writes are synchronized with + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT." + * + * Therefore, the above flags concerning color attachments also apply to + * color and depth/stencil resolve attachments. 
*/ if (all_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { pass->subpass_flushes[0] |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; } - if (all_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + if (all_usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT)) { pass->subpass_flushes[pass->subpass_count] |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index be869cfa061..1bdc896e708 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -377,12 +377,12 @@ populate_wm_prog_key(const struct gen_device_info *devinfo, * harmless to compute it and then let dead-code take care of it. */ if (ms_info->rasterizationSamples > 1) { - key->persample_interp = + key->persample_interp = ms_info->sampleShadingEnable && (ms_info->minSampleShading * ms_info->rasterizationSamples) > 1; key->multisample_fbo = true; } - key->frag_coord_adds_sample_pos = ms_info->sampleShadingEnable; + key->frag_coord_adds_sample_pos = key->persample_interp; } } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 110b2ccf023..9979b832a7b 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -120,12 +120,9 @@ struct gen_l3_config; #define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */ #define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL #define HIGH_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ -#define HIGH_HEAP_MAX_ADDRESS 0xfffeffffffffULL #define LOW_HEAP_SIZE \ (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1) -#define HIGH_HEAP_SIZE \ - (HIGH_HEAP_MAX_ADDRESS - HIGH_HEAP_MIN_ADDRESS + 1) #define DYNAMIC_STATE_POOL_SIZE \ (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1) #define BINDING_TABLE_POOL_SIZE \ @@ -163,6 +160,18 @@ struct gen_l3_config; #define MAX_GEN8_IMAGES 8 #define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */ +/* From the Skylake PRM Vol. 
7 "Binding Table Surface State Model": + * + * "The surface state model is used when a Binding Table Index (specified + * in the message descriptor) of less than 240 is specified. In this model, + * the Binding Table Index is used to index into the binding table, and the + * binding table entry contains a pointer to the SURFACE_STATE." + * + * Binding table values above 240 are used for various things in the hardware + * such as stateless, stateless with incoherent cache, SLM, and bindless. + */ +#define MAX_BINDING_TABLE_SIZE 240 + /* The kernel relocation API has a limitation of a 32-bit delta value * applied to the address before it is written which, in spite of it being * unsigned, is treated as signed . Because of the way that this maps to @@ -733,7 +742,7 @@ struct anv_state_table { struct anv_free_entry *map; uint32_t size; struct anv_block_state state; - struct u_vector mmap_cleanups; + struct u_vector cleanups; }; struct anv_state_pool { @@ -894,6 +903,8 @@ struct anv_memory_heap { VkMemoryHeapFlags flags; /* Driver-internal book-keeping */ + uint64_t vma_start; + uint64_t vma_size; bool supports_48bit_addresses; }; @@ -1449,10 +1460,10 @@ _anv_combine_address(struct anv_batch *batch, void *location, */ /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_MOCS 2 +#define GEN9_MOCS (2 << 1) /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_EXTERNAL_MOCS 1 +#define GEN9_EXTERNAL_MOCS (1 << 1) /* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */ #define GEN10_MOCS GEN9_MOCS @@ -1581,6 +1592,10 @@ struct anv_descriptor_set { uint32_t size; uint32_t buffer_count; struct anv_buffer_view *buffer_views; + + /* Link to descriptor pool's desc_sets list . 
*/ + struct list_head pool_link; + struct anv_descriptor descriptors[0]; }; @@ -1614,6 +1629,8 @@ struct anv_descriptor_pool { struct anv_state_stream surface_state_stream; void *surface_state_free_list; + struct list_head desc_sets; + char data[0]; }; @@ -3045,7 +3062,13 @@ anv_can_sample_with_hiz(const struct gen_device_info * const devinfo, if (!(image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) return false; - if (devinfo->gen < 8) + /* Allow this feature on BDW even though it is disabled in the BDW devinfo + * struct. There's documentation which suggests that this feature actually + * reduces performance on BDW, but it has only been observed to help so + * far. Sampling fast-cleared blocks on BDW must also be handled with care + * (see depth_stencil_attachment_compute_aux_usage() for more info). + */ + if (devinfo->gen != 8 && !devinfo->has_sample_with_hiz) return false; return image->samples == 1; diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 352892aee33..380283bdd56 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer) }; const int max = 0xffff; + + uint32_t y_min = s->offset.y; + uint32_t x_min = s->offset.x; + uint32_t y_max = s->offset.y + s->extent.height - 1; + uint32_t x_max = s->offset.x + s->extent.width - 1; + + /* Do this math using int64_t so overflow gets clamped correctly. 
*/ + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + y_min = clamp_int64((uint64_t) y_min, + cmd_buffer->state.render_area.offset.y, max); + x_min = clamp_int64((uint64_t) x_min, + cmd_buffer->state.render_area.offset.x, max); + y_max = clamp_int64((uint64_t) y_max, 0, + cmd_buffer->state.render_area.offset.y + + cmd_buffer->state.render_area.extent.height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, + cmd_buffer->state.render_area.offset.x + + cmd_buffer->state.render_area.extent.width - 1); + } else if (fb) { + y_min = clamp_int64((uint64_t) y_min, 0, max); + x_min = clamp_int64((uint64_t) x_min, 0, max); + y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1); + } + struct GEN7_SCISSOR_RECT scissor = { - /* Do this math using int64_t so overflow gets clamped correctly. */ - .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max), - .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max), - .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1), - .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1) + .ScissorRectangleYMin = y_min, + .ScissorRectangleXMin = x_min, + .ScissorRectangleYMax = y_max, + .ScissorRectangleXMax = x_max }; if (s->extent.width <= 0 || s->extent.height <= 0) { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index d980ec428d0..a3994f5870c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2653,7 +2653,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { sob.SOBufferIndex = idx; - if (cmd_buffer->state.xfb_enabled && xfb->buffer) { + if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { sob.SOBufferEnable = true; sob.MOCS = cmd_buffer->device->default_mocs, sob.StreamOffsetWriteEnable = false; 
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index d2142ae42c2..3e13a12d776 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -464,6 +464,7 @@ emit_rs_state(struct anv_pipeline *pipeline, sf.TriangleStripListProvokingVertexSelect = 0; sf.LineStripListProvokingVertexSelect = 0; sf.TriangleFanProvokingVertexSelect = 1; + sf.VertexSubPixelPrecisionSelect = _8Bit; const struct brw_vue_prog_data *last_vue_prog_data = anv_pipeline_get_last_vue_prog_data(pipeline); @@ -1077,6 +1078,10 @@ emit_3dstate_clip(struct anv_pipeline *pipeline, clip.APIMode = APIMODE_D3D, clip.ViewportXYClipTestEnable = true; +#if GEN_GEN >= 8 + clip.VertexSubPixelPrecisionSelect = _8Bit; +#endif + clip.ClipMode = CLIPMODE_NORMAL; clip.TriangleStripListProvokingVertexSelect = 0; @@ -1211,13 +1216,30 @@ emit_3dstate_streamout(struct anv_pipeline *pipeline, hole_dwords -= 4; } + int varying = output->location; + uint8_t component_mask = output->component_mask; + /* VARYING_SLOT_PSIZ contains three scalar fields packed together: + * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y + * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z + * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w + */ + if (varying == VARYING_SLOT_LAYER) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 1; // SO_DECL_COMPMASK_Y + } else if (varying == VARYING_SLOT_VIEWPORT) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 2; // SO_DECL_COMPMASK_Z + } else if (varying == VARYING_SLOT_PSIZ) { + component_mask = 1 << 3; // SO_DECL_COMPMASK_W + } + next_offset[buffer] = output->offset + - __builtin_popcount(output->component_mask) * 4; + __builtin_popcount(component_mask) * 4; so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { .OutputBufferSlot = buffer, - .RegisterIndex = vue_map->varying_to_slot[output->location], - .ComponentMask = output->component_mask, + .RegisterIndex = vue_map->varying_to_slot[varying], + .ComponentMask = component_mask, }; 
} @@ -2065,9 +2087,29 @@ compute_pipeline_create( vfe.URBEntryAllocationSize = GEN_GEN <= 7 ? 0 : 2; vfe.CURBEAllocationSize = vfe_curbe_allocation; - vfe.PerThreadScratchSpace = get_scratch_space(cs_bin); - vfe.ScratchSpaceBasePointer = - get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin); + if (cs_bin->prog_data->total_scratch) { + if (GEN_GEN >= 8) { + /* Broadwell's Per Thread Scratch Space is in the range [0, 11] + * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 11; + } else if (GEN_IS_HASWELL) { + /* Haswell's Per Thread Scratch Space is in the range [0, 10] + * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 12; + } else { + /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB] + * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. + */ + vfe.PerThreadScratchSpace = + cs_bin->prog_data->total_scratch / 1024 - 1; + } + vfe.ScratchSpaceBasePointer = + get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin); + } } struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 794d92dc6c9..6c1c76aeef0 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -356,14 +356,23 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, } static void -emit_query_availability(struct anv_cmd_buffer *cmd_buffer, - struct anv_address addr) +emit_query_mi_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) +{ + genX(cmd_buffer_mi_memset)(cmd_buffer, addr, available, 8); +} + +static void +emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WriteImmediateData; pc.Address = addr; - pc.ImmediateData = 1; + 
pc.ImmediateData = available; } } @@ -376,12 +385,40 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, struct anv_query_pool *pool, uint32_t first_index, uint32_t num_queries) { - for (uint32_t i = 0; i < num_queries; i++) { - struct anv_address slot_addr = - anv_query_address(pool, first_index + i); - genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8), - 0, pool->stride - 8); - emit_query_availability(cmd_buffer, slot_addr); + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + /* These queries are written with a PIPE_CONTROL so clear them using the + * PIPE_CONTROL as well so we don't have to synchronize between 2 types + * of operations. + */ + assert((pool->stride % 8) == 0); + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + + for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) { + emit_query_pc_availability(cmd_buffer, + anv_address_add(slot_addr, qword * 8), + false); + } + emit_query_pc_availability(cmd_buffer, slot_addr, true); + } + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8), + 0, pool->stride - 8); + emit_query_mi_availability(cmd_buffer, slot_addr, true); + } + break; + + default: + unreachable("Unsupported query type"); } } @@ -394,11 +431,28 @@ void genX(CmdResetQueryPool)( ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - for (uint32_t i = 0; i < queryCount; i++) { - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) { - sdm.Address = anv_query_address(pool, firstQuery + i); - sdm.ImmediateData = 0; + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case 
VK_QUERY_TYPE_TIMESTAMP: + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_mi_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); } + break; + } + + default: + unreachable("Unsupported query type"); } } @@ -511,9 +565,9 @@ void genX(CmdBeginQueryIndexedEXT)( void genX(CmdEndQuery)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, - VkQueryControlFlags flags) + uint32_t query) { - genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, flags, 0); + genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0); } void genX(CmdEndQueryIndexedEXT)( @@ -529,7 +583,7 @@ void genX(CmdEndQueryIndexedEXT)( switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); - emit_query_availability(cmd_buffer, query_addr); + emit_query_pc_availability(cmd_buffer, query_addr, true); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: { @@ -548,7 +602,7 @@ void genX(CmdEndQueryIndexedEXT)( offset += 16; } - emit_query_availability(cmd_buffer, query_addr); + emit_query_mi_availability(cmd_buffer, query_addr, true); break; } @@ -559,7 +613,7 @@ void genX(CmdEndQueryIndexedEXT)( } emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 16)); - emit_query_availability(cmd_buffer, query_addr); + emit_query_mi_availability(cmd_buffer, query_addr, true); break; default: @@ -614,7 +668,7 @@ void genX(CmdWriteTimestamp)( break; } - emit_query_availability(cmd_buffer, query_addr); + emit_query_pc_availability(cmd_buffer, query_addr, true); /* When multiview is active the spec requires that N consecutive query * indices are used, where N is the number of active views in the subpass. 
@@ -817,7 +871,20 @@ void genX(CmdCopyQueryPoolResults)( } if ((flags & VK_QUERY_RESULT_WAIT_BIT) || - (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) { + (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to + * stall the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without + * any additional synchronization." + */ + pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_TIMESTAMP) { cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 05fdeca8c25..af1223ad3b9 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2017-2018 Intel Corporation +# Copyright © 2017-2019 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -105,7 +105,7 @@ foreach g : [['70', ['gen7_cmd_buffer.c']], ['75', ['gen7_cmd_buffer.c']], c_vis_args, no_override_init_args, c_sse2_args, '-DGEN_VERSIONx10=@0@'.format(_gen), ], - dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], + dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml], ) endforeach @@ -178,7 +178,10 @@ endif libanv_common = static_library( 'anv_common', - [libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h], + [ + libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h, + gen_xml_pack, + ], include_directories : [ inc_common, 
inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util, inc_vulkan_wsi, @@ -200,7 +203,7 @@ libvulkan_intel = shared_library( libvulkan_util, libvulkan_wsi, libmesa_util, ], dependencies : [ - dep_thread, dep_dl, dep_m, anv_deps, idep_nir, + dep_thread, dep_dl, dep_m, anv_deps, idep_nir, idep_genxml, ], c_args : anv_flags, link_args : ['-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections], diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index ad9b9d87b05..7d61c1df4fc 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -111,7 +111,7 @@ set_adaptive_sync_property(xcb_connection_t *conn, xcb_drawable_t drawable, xcb_intern_atom_reply_t* reply; xcb_void_cookie_t check; - cookie = xcb_intern_atom(conn, 0, sizeof(name), name); + cookie = xcb_intern_atom(conn, 0, strlen(name), name); reply = xcb_intern_atom_reply(conn, cookie, NULL); if (reply == NULL) return; diff --git a/src/mapi/es1api/meson.build b/src/mapi/es1api/meson.build index b0416e705a1..14ca49c1407 100644 --- a/src/mapi/es1api/meson.build +++ b/src/mapi/es1api/meson.build @@ -38,7 +38,7 @@ libglesv1_cm = shared_library( include_directories : [inc_src, inc_include, inc_mapi], link_with : libglapi, dependencies : [dep_thread, dep_libdrm, dep_m, dep_dl], - version : '1.0.0', + version : '1.1.0', install : true, ) diff --git a/src/mesa/drivers/dri/Android.mk b/src/mesa/drivers/dri/Android.mk index 53ff4b4f632..60c8476a38a 100644 --- a/src/mesa/drivers/dri/Android.mk +++ b/src/mesa/drivers/dri/Android.mk @@ -49,11 +49,19 @@ MESA_DRI_WHOLE_STATIC_LIBRARIES := \ MESA_DRI_SHARED_LIBRARIES := \ libcutils \ libdl \ - libexpat \ libglapi \ liblog \ libz +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +MESA_DRI_WHOLE_STATIC_LIBRARIES += \ + libexpat +else +MESA_DRI_SHARED_LIBRARIES += \ + libexpat +endif + 
#----------------------------------------------- # Build drivers and libmesa_dri_common diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk index 1574c8834c9..97def8f03fe 100644 --- a/src/mesa/drivers/dri/i965/Android.mk +++ b/src/mesa/drivers/dri/i965/Android.mk @@ -274,6 +274,8 @@ LOCAL_LDFLAGS += $(MESA_DRI_LDFLAGS) LOCAL_CFLAGS := \ $(MESA_DRI_CFLAGS) +LOCAL_CFLAGS += -Wno-error + LOCAL_C_INCLUDES := \ $(MESA_DRI_C_INCLUDES) \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_glsl,,) \ diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index b562c6ea21c..0bda2897e8e 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -34,6 +34,8 @@ AM_CFLAGS = \ -I$(top_builddir)/src/util \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_srcdir)/src/gtest/include \ + -I$(top_builddir)/src/compiler \ + -I$(top_srcdir)/src/compiler \ -I$(top_builddir)/src/compiler/glsl \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/src/compiler/nir \ diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c index f1675b191c1..43077e60da4 100644 --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c @@ -402,6 +402,8 @@ vma_alloc(struct brw_bufmgr *bufmgr, /* Without softpin support, we let the kernel assign addresses. 
*/ assert(brw_using_softpin(bufmgr)); + alignment = ALIGN(alignment, PAGE_SIZE); + struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size); uint64_t addr; @@ -1487,7 +1489,7 @@ brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd) brw_bo_make_external(bo); if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, - DRM_CLOEXEC, prime_fd) != 0) + DRM_CLOEXEC | DRM_RDWR, prime_fd) != 0) return -errno; bo->reusable = false; @@ -1717,6 +1719,9 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd) const uint64_t _4GB = 4ull << 30; + /* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */ + const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE; + if (devinfo->gen >= 8 && gtt_size > _4GB) { bufmgr->initial_kflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; @@ -1726,9 +1731,13 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd) bufmgr->initial_kflags |= EXEC_OBJECT_PINNED; util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_LOW_4G], - PAGE_SIZE, _4GB); + PAGE_SIZE, _4GB_minus_1); + + /* Leave the last 4GB out of the high vma range, so that no state + * base address + size can overflow 48 bits. 
+ */ util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_OTHER], - 1 * _4GB, gtt_size - 1 * _4GB); + 1 * _4GB, gtt_size - 2 * _4GB); } else if (devinfo->gen >= 10) { /* Softpin landed in 4.5, but GVT used an aliasing PPGTT until * kernel commit 6b3816d69628becb7ff35978aa0751798b4a940a in diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 505da9896b3..e4bc5fe99f0 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -893,6 +893,19 @@ brw_process_driconf_options(struct brw_context *brw) ctx->Const.dri_config_options_sha1 = ralloc_array(brw, unsigned char, 20); driComputeOptionsSha1(&brw->screen->optionCache, ctx->Const.dri_config_options_sha1); + + brw->screen->compiler->simd32_heuristics_control.grouped_sends_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_grouped_check"); + brw->screen->compiler->simd32_heuristics_control.max_grouped_sends = + driQueryOptioni(&brw->optionCache, "simd32_heuristic_grouped_sends"); + brw->screen->compiler->simd32_heuristics_control.inst_count_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_inst_check"); + brw->screen->compiler->simd32_heuristics_control.inst_count_ratio = + driQueryOptionf(&brw->optionCache, "simd32_heuristic_inst_ratio"); + brw->screen->compiler->simd32_heuristics_control.mrt_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_mrt_check"); + brw->screen->compiler->simd32_heuristics_control.max_mrts = + driQueryOptioni(&brw->optionCache, "simd32_heuristic_max_mrts"); } GLboolean diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 66fe5b3a8a0..7237f39d286 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -686,6 +686,7 @@ enum brw_query_kind { OA_COUNTERS, OA_COUNTERS_RAW, PIPELINE_STATS, + NULL_RENDERER, }; struct brw_perf_query_register_prog { @@ -1006,6 +1007,9 @@ struct 
brw_context /* High bits of the last seen index buffer address (for workarounds). */ uint16_t last_bo_high_bits; + + /* Used to understand is GPU state of primitive restart is up to date */ + bool enable_cut_index; } ib; /* Active vertex program: @@ -1246,6 +1250,7 @@ struct brw_context int n_active_oa_queries; int n_active_pipeline_stats_queries; + int n_active_null_renderers; /* The number of queries depending on running OA counters which * extends beyond brw_end_perf_query() since we need to wait until diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 2729a54e144..cdfa435a1f5 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1652,11 +1652,17 @@ enum brw_pixel_shader_coverage_mask_mode { #define GEN10_CACHE_MODE_SS 0x0e420 #define GEN10_FLOAT_BLEND_OPTIMIZATION_ENABLE (1 << 4) -#define INSTPM 0x20c0 +#define INSTPM 0x20c0 /* Gen6-8 */ # define INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 6) +# define INSTPM_GLOBAL_DEBUG_ENABLE (1 << 4) +# define INSTPM_MEDIA_INSTRUCTION_DISABLE (1 << 3) +# define INSTPM_3D_RENDERER_INSTRUCTION_DISABLE (1 << 2) +# define INSTPM_3D_STATE_INSTRUCTION_DISABLE (1 << 1) #define CS_DEBUG_MODE2 0x20d8 /* Gen9+ */ # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4) +# define CSDBG2_MEDIA_INSTRUCTION_DISABLE (1 << 1) +# define CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE (1 << 0) #define GEN7_RPSTAT1 0xA01C #define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index dfbc45fe938..2f52899fcb0 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -776,6 +776,14 @@ brw_upload_indices(struct brw_context *brw) brw->ib.index_size = index_buffer->index_size; brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER; } + + /* We need to re-emit an index buffer state each time + * when cut index flag 
is changed + */ + if (brw->prim_restart.enable_cut_index != brw->ib.enable_cut_index) { + brw->ib.enable_cut_index = brw->prim_restart.enable_cut_index; + brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER; + } } const struct brw_tracked_state brw_indices = { diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 2cbb1e0b879..95d87dc56fd 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -323,7 +323,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg) brw_shader_gather_info(prog->nir, prog); - NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shProg); NIR_PASS_V(prog->nir, gl_nir_lower_atomics, shProg, false); NIR_PASS_V(prog->nir, nir_lower_atomics_to_ssbo, prog->nir->info.num_abos); diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp index 256fdd8fc79..7e2a5b045dd 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp +++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp @@ -80,15 +80,15 @@ setup_vec4_image_param(uint32_t *params, uint32_t idx, } static void -brw_setup_image_uniform_values(gl_shader_stage stage, - struct brw_stage_prog_data *stage_prog_data, - unsigned param_start_index, - const gl_uniform_storage *storage) +brw_setup_image_uniform_values(nir_variable *var, + struct brw_stage_prog_data *prog_data) { - uint32_t *param = &stage_prog_data->param[param_start_index]; + unsigned param_start_index = var->data.driver_location / 4; + uint32_t *param = &prog_data->param[param_start_index]; + unsigned num_images = MAX2(1, var->type->arrays_of_arrays_size()); - for (unsigned i = 0; i < MAX2(storage->array_elements, 1); i++) { - const unsigned image_idx = storage->opaque[stage].index + i; + for (unsigned i = 0; i < num_images; i++) { + const unsigned image_idx = var->data.binding + i; /* Upload the brw_image_param structure. 
The order is expected to match * the BRW_IMAGE_PARAM_*_OFFSET defines. @@ -150,6 +150,14 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var, struct brw_stage_prog_data *stage_prog_data, bool is_scalar) { + if (var->type->without_array()->is_sampler()) + return; + + if (var->type->without_array()->is_image()) { + brw_setup_image_uniform_values(var, stage_prog_data); + return; + } + /* The data for our (non-builtin) uniforms is stored in a series of * gl_uniform_storage structs for each subcomponent that * glGetUniformLocation() could name. We know it's been set up in the same @@ -162,15 +170,17 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var, struct gl_uniform_storage *storage = &prog->sh.data->UniformStorage[var->data.location + u]; - if (storage->builtin || storage->type->is_sampler()) + /* We already handled samplers and images via the separate top-level + * variables created by gl_nir_lower_samplers_as_deref(), but they're + * still part of the structure's storage, and so we'll see them while + * walking it to set up the other regular fields. Just skip over them. + */ + if (storage->builtin || + storage->type->is_sampler() || + storage->type->is_image()) continue; - if (storage->type->is_image()) { - brw_setup_image_uniform_values(stage, stage_prog_data, - uniform_index, storage); - uniform_index += - BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1); - } else { + { gl_constant_value *components = storage->storage; unsigned vector_count = (MAX2(storage->array_elements, 1) * storage->type->matrix_columns); diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c index 10e3d024f17..85d14a83c7e 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_query.c +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c @@ -330,6 +330,12 @@ dump_perf_query_callback(GLuint id, void *query_void, void *brw_void) o->Active ? "Active," : (o->Ready ? 
"Ready," : "Pending,"), obj->pipeline_stats.bo ? "yes" : "no"); break; + case NULL_RENDERER: + DBG("%4d: %-6s %-8s NULL_RENDERER\n", + id, + o->Used ? "Dirty," : "New,", + o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,")); + break; default: unreachable("Unknown query type"); break; @@ -431,6 +437,10 @@ brw_get_perf_query_info(struct gl_context *ctx, *n_active = brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + *n_active = brw->perfquery.n_active_null_renderers; + break; + default: unreachable("Unknown query type"); break; @@ -1020,6 +1030,7 @@ brw_begin_perf_query(struct gl_context *ctx, struct brw_context *brw = brw_context(ctx); struct brw_perf_query_object *obj = brw_perf_query(o); const struct brw_perf_query_info *query = obj->query; + const struct gen_device_info *devinfo = &brw->screen->devinfo; /* We can assume the frontend hides mistaken attempts to Begin a * query object multiple times before its End. Similarly if an @@ -1104,7 +1115,6 @@ brw_begin_perf_query(struct gl_context *ctx, /* If the OA counters aren't already on, enable them. 
*/ if (brw->perfquery.oa_stream_fd == -1) { __DRIscreen *screen = brw->screen->driScrnPriv; - const struct gen_device_info *devinfo = &brw->screen->devinfo; /* The period_exponent gives a sampling period as follows: * sample_period = timestamp_period * 2^(period_exponent + 1) @@ -1250,6 +1260,23 @@ brw_begin_perf_query(struct gl_context *ctx, ++brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + ++brw->perfquery.n_active_null_renderers; + if (devinfo->gen >= 9) { + brw_load_register_imm32(brw, CS_DEBUG_MODE2, + REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE) | + CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE); + } else { + brw_load_register_imm32(brw, INSTPM, + REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE) | + INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE); + } + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_LRI_WRITE_IMMEDIATE); + break; + default: unreachable("Unknown query type"); break; @@ -1270,6 +1297,7 @@ brw_end_perf_query(struct gl_context *ctx, { struct brw_context *brw = brw_context(ctx); struct brw_perf_query_object *obj = brw_perf_query(o); + const struct gen_device_info *devinfo = &brw->screen->devinfo; DBG("End(%d)\n", o->Id); @@ -1312,6 +1340,21 @@ brw_end_perf_query(struct gl_context *ctx, --brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + if (--brw->perfquery.n_active_null_renderers == 0) { + if (devinfo->gen >= 9) { + brw_load_register_imm32(brw, CS_DEBUG_MODE2, + REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE)); + } else { + brw_load_register_imm32(brw, INSTPM, + REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE)); + } + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_LRI_WRITE_IMMEDIATE); + } + break; + default: unreachable("Unknown query type"); break; @@ -1337,6 +1380,9 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o) bo = obj->pipeline_stats.bo; break; + 
case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -1387,6 +1433,8 @@ brw_is_perf_query_ready(struct gl_context *ctx, return (obj->pipeline_stats.bo && !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) && !brw_bo_busy(obj->pipeline_stats.bo)); + case NULL_RENDERER: + return true; default: unreachable("Unknown query type"); @@ -1602,6 +1650,9 @@ brw_get_perf_query_data(struct gl_context *ctx, written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data); break; + case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -1672,6 +1723,9 @@ brw_delete_perf_query(struct gl_context *ctx, } break; + case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -2152,6 +2206,15 @@ get_register_queries_function(const struct gen_device_info *devinfo) return NULL; } +static void +fill_null_renderer_perf_query_info(struct brw_context *brw, + struct brw_perf_query_info *query) +{ + query->kind = NULL_RENDERER; + query->name = "Intel_Null_Hardware_Query"; + query->n_counters = 0; +} + static unsigned brw_init_perf_query_info(struct gl_context *ctx) { @@ -2210,6 +2273,10 @@ brw_init_perf_query_info(struct gl_context *ctx) enumerate_sysfs_metrics(brw); brw_perf_query_register_mdapi_oa_query(brw); + + struct brw_perf_query_info *null_query = + brw_perf_query_append_query_info(brw); + fill_null_renderer_perf_query_info(brw, null_query); } brw->perfquery.unaccumulated = diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 9ab25cf664c..841b7df896d 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -42,7 +42,8 @@ #include "compiler/glsl/ir.h" #include "compiler/glsl/program.h" #include "compiler/glsl/glsl_to_nir.h" -#include "compiler/glsl/float64_glsl.h" +#include "compiler/glsl/gl_nir.h" +#include "glsl/float64_glsl.h" #include "brw_program.h" #include "brw_context.h" @@ -165,6 +166,9 
@@ brw_create_nir(struct brw_context *brw, nir = brw_preprocess_nir(brw->screen->compiler, nir); + if (shader_prog) + NIR_PASS_V(nir, gl_nir_lower_samplers, shader_prog); + NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo); if (stage == MESA_SHADER_TESS_CTRL) { diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 7bbb6166344..9f88d625d63 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -309,6 +309,7 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx, const int s = u_bit_scan(&mask); key->swizzles[s] = SWIZZLE_NOOP; + key->scale_factors[s] = 0.0f; int unit_id = prog->SamplerUnits[s]; const struct gl_texture_unit *unit = &ctx->Texture.Unit[unit_id]; @@ -406,6 +407,10 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx, } if (t->Target == GL_TEXTURE_EXTERNAL_OES && intel_tex->planar_format) { + + /* Setup possible scaling factor. */ + key->scale_factors[s] = intel_tex->planar_format->scaling_factor; + switch (intel_tex->planar_format->components) { case __DRI_IMAGE_COMPONENTS_Y_UV: key->y_uv_image_mask |= 1 << s; diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index b067a174056..8269056c74c 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1681,6 +1681,11 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw) ISL_FORMAT_RAW, 3 * sizeof(GLuint), 1, RELOC_WRITE); + + /* The state buffer now holds a reference to our upload, drop ours. 
*/ + if (bo != brw->compute.num_work_groups_bo) + brw_bo_unreference(bo); + brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } } diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index dcdfb3c9292..73c983ce742 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -998,7 +998,8 @@ genX(emit_index_buffer)(struct brw_context *brw) brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) { #if GEN_GEN < 8 && !GEN_IS_HASWELL - ib.CutIndexEnable = brw->prim_restart.enable_cut_index; + assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index); + ib.CutIndexEnable = brw->ib.enable_cut_index; #endif ib.IndexFormat = brw_get_index_type(index_buffer->index_size); @@ -2445,7 +2446,7 @@ set_scissor_bits(const struct gl_context *ctx, int i, bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width); - bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0); + bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height); bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height); _mesa_intersect_scissor_bounding_box(ctx, i, bbox); diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 7513d15c3dd..92ecd612006 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -182,14 +182,16 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_conditional_render_inverted = true; ctx->Extensions.ARB_cull_distance = true; ctx->Extensions.ARB_draw_buffers_blend = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.ARB_enhanced_layouts = true; ctx->Extensions.ARB_ES3_compatibility = true; ctx->Extensions.ARB_fragment_layer_viewport = true; ctx->Extensions.ARB_pipeline_statistics_query = true; ctx->Extensions.ARB_sample_shading = 
true; ctx->Extensions.ARB_shading_language_420pack = true; - if (ctx->API != API_OPENGL_COMPAT) { + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) { ctx->Extensions.ARB_texture_buffer_object = true; ctx->Extensions.ARB_texture_buffer_object_rgb32 = true; ctx->Extensions.ARB_texture_buffer_range = true; @@ -199,7 +201,8 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_texture_multisample = true; ctx->Extensions.ARB_uniform_buffer_object = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.AMD_vertex_shader_layer = true; ctx->Extensions.EXT_framebuffer_multisample = true; ctx->Extensions.EXT_framebuffer_multisample_blit_scaled = true; @@ -228,7 +231,8 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_conservative_depth = true; ctx->Extensions.ARB_derivative_control = true; ctx->Extensions.ARB_framebuffer_no_attachments = true; - if (ctx->API != API_OPENGL_COMPAT) { + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) { ctx->Extensions.ARB_gpu_shader5 = true; ctx->Extensions.ARB_gpu_shader_fp64 = true; } @@ -239,7 +243,8 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_shader_image_size = true; ctx->Extensions.ARB_shader_precision = true; ctx->Extensions.ARB_shader_texture_image_samples = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.ARB_tessellation_shader = true; ctx->Extensions.ARB_texture_compression_bptc = true; ctx->Extensions.ARB_texture_view = true; @@ -248,7 +253,6 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.EXT_shader_samples_identical = true; ctx->Extensions.OES_primitive_bounding_box = true; ctx->Extensions.OES_texture_buffer = true; - ctx->Extensions.ARB_fragment_shader_interlock = true; if (can_do_pipelined_register_writes(brw->screen)) { 
ctx->Extensions.ARB_draw_indirect = true; @@ -313,6 +317,30 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.KHR_blend_equation_advanced_coherent = true; ctx->Extensions.KHR_texture_compression_astc_ldr = true; ctx->Extensions.KHR_texture_compression_astc_sliced_3d = true; + + /* + * From the Skylake PRM Vol. 7 (Memory Fence Message, page 221): + * "A memory fence message issued by a thread causes further messages + * issued by the thread to be blocked until all previous data port + * messages have completed, or the results can be globally observed from + * the point of view of other threads in the system." + * + * From the Haswell PRM Vol. 7 (Memory Fence, page 256): + * "A memory fence message issued by a thread causes further messages + * issued by the thread to be blocked until all previous messages issued + * by the thread to that data port (data cache or render cache) have + * been globally observed from the point of view of other threads in the + * system." + * + * Summarized: For ARB_fragment_shader_interlock to work, we need to + * ensure memory access ordering for all messages to the dataport from + * all threads. Memory fence messages prior to SKL only provide memory + * access ordering for messages from the same thread, so we can only + * support the feature from Gen9 onwards. 
+ * + */ + + ctx->Extensions.ARB_fragment_shader_interlock = true; } if (gen_device_info_is_9lp(devinfo)) @@ -321,7 +349,8 @@ intelInitExtensions(struct gl_context *ctx) if (devinfo->gen >= 6) ctx->Extensions.INTEL_performance_query = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.ARB_base_instance = true; if (ctx->API != API_OPENGL_CORE) ctx->Extensions.ARB_color_buffer_float = true; diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h index ca604159dc2..4ab8a49b8bb 100644 --- a/src/mesa/drivers/dri/i965/intel_image.h +++ b/src/mesa/drivers/dri/i965/intel_image.h @@ -62,6 +62,7 @@ struct intel_image_format { uint32_t dri_format; int cpp; } planes[3]; + float scaling_factor; }; struct __DRIimageRec { diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index 8838f977bb6..2436f48a065 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -61,6 +61,33 @@ DRI_CONF_BEGIN DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects") DRI_CONF_DESC_END DRI_CONF_OPT_END + + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_grouped_check, "true") + DRI_CONF_DESC(en, "Enable/disable grouped texture fetch " + "check in the SIMD32 selection heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_grouped_sends, int, 6, "1:999") + DRI_CONF_DESC(en, "How many grouped texture fetches should " + "the SIMD32 selection heuristic allow.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_inst_check, "true") + DRI_CONF_DESC(en, "Enable/disable SIMD32/SIMD16 instruction " + "count ratio check in the SIMD32 selection " + "heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_inst_ratio, float, 2.3, "1:999") + DRI_CONF_DESC(en, "SIMD32/SIMD16 instruction count ratio " + "the SIMD32 selection heuristic should allow.") + DRI_CONF_OPT_END 
+ DRI_CONF_OPT_BEGIN_B(simd32_heuristic_mrt_check, "true") + DRI_CONF_DESC(en, "Enable/disable MRT write check in the " + "SIMD32 selection heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_max_mrts, int, 1, "1:8") + DRI_CONF_DESC(en, "How many MRT writes should the SIMD32 " + "selection heuristic allow.") + DRI_CONF_OPT_END + DRI_CONF_MESA_NO_ERROR("false") DRI_CONF_SECTION_END @@ -282,6 +309,18 @@ static const struct intel_image_format intel_image_formats[] = { { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } }, + { __DRI_IMAGE_FOURCC_P010, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + + { __DRI_IMAGE_FOURCC_P012, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + + { __DRI_IMAGE_FOURCC_P016, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + { __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } }, diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build index cd3683ae7ec..0bc6125f19c 100644 --- a/src/mesa/drivers/dri/i965/meson.build +++ b/src/mesa/drivers/dri/i965/meson.build @@ -187,7 +187,7 @@ libi965 = static_library( i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler, libblorp ], - dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], + dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml], ) dri_drivers += libi965 diff --git a/src/mesa/drivers/dri/meson.build b/src/mesa/drivers/dri/meson.build index d98c823f5fe..dddc4ae3dfd 100644 --- a/src/mesa/drivers/dri/meson.build +++ b/src/mesa/drivers/dri/meson.build @@ -54,6 +54,10 @@ if dri_drivers != [] dep_selinux, dep_libdrm, dep_expat, dep_m, dep_thread, 
dep_dl, idep_nir, ], link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections], + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : dri_drivers_path, + name_suffix : 'so', ) meson.add_install_script( @@ -78,7 +82,7 @@ if with_dri filebase : 'dri', description : 'Direct Rendering Infrastructure', version : meson.project_version(), - variables : ['dridriverdir=${prefix}/' + dri_drivers_path], + variables : ['dridriverdir=' + dri_drivers_path], requires_private : dri_req_private, ) endif diff --git a/src/mesa/drivers/osmesa/meson.build b/src/mesa/drivers/osmesa/meson.build index a406bb3c210..c479b740131 100644 --- a/src/mesa/drivers/osmesa/meson.build +++ b/src/mesa/drivers/osmesa/meson.build @@ -33,7 +33,8 @@ libosmesa = shared_library( include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, ], - link_with : [libmesa_classic, libglapi_static, osmesa_link_with], + link_whole : libglapi_static, + link_with : [libmesa_classic, osmesa_link_with], dependencies : [dep_thread, dep_selinux], version : '8.0.0', install : true, diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index 492f01de957..9ce8a94c5de 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -1760,6 +1760,10 @@ _mesa_make_current( struct gl_context *newCtx, check_init_viewport(newCtx, drawBuffer->Width, drawBuffer->Height); } + else { + _mesa_reference_framebuffer(&newCtx->WinSysDrawBuffer, NULL); + _mesa_reference_framebuffer(&newCtx->WinSysReadBuffer, NULL); + } if (newCtx->FirstTimeCurrent) { handle_first_current(newCtx); diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c index 97461cede34..eb22fcbdb31 100644 --- a/src/mesa/main/dlist.c +++ b/src/mesa/main/dlist.c @@ -962,6 +962,8 @@ make_list(GLuint name, GLuint count) dlist->Name = name; dlist->Head = malloc(sizeof(Node) * count); dlist->Head[0].opcode = OPCODE_END_OF_LIST; + /* All InstSize[] entries must be 
non-zero */ + InstSize[OPCODE_END_OF_LIST] = 1; return dlist; } @@ -2753,6 +2755,7 @@ save_Fogiv(GLenum pname, const GLint *params) case GL_FOG_START: case GL_FOG_END: case GL_FOG_INDEX: + case GL_FOG_COORDINATE_SOURCE: p[0] = (GLfloat) *params; p[1] = 0.0f; p[2] = 0.0f; diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c index a9687913627..30560ba047e 100644 --- a/src/mesa/main/errors.c +++ b/src/mesa/main/errors.c @@ -231,6 +231,9 @@ _mesa_gl_vdebug(struct gl_context *ctx, _mesa_debug_get_id(id); len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args); + if (len >= MAX_DEBUG_MESSAGE_LENGTH) + /* message was truncated */ + len = MAX_DEBUG_MESSAGE_LENGTH - 1; _mesa_log_msg(ctx, source, type, *id, severity, len, s); } diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index 8290ea94dfc..341fd93efc6 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -4663,8 +4663,12 @@ get_fb_attachment(struct gl_context *ctx, struct gl_framebuffer *fb, case GL_COLOR_ATTACHMENT12: case GL_COLOR_ATTACHMENT13: case GL_COLOR_ATTACHMENT14: - case GL_COLOR_ATTACHMENT15: - return &fb->Attachment[BUFFER_COLOR0 + attachment - GL_COLOR_ATTACHMENT0]; + case GL_COLOR_ATTACHMENT15: { + const unsigned i = attachment - GL_COLOR_ATTACHMENT0; + if (i >= ctx->Const.MaxColorAttachments) + return NULL; + return &fb->Attachment[BUFFER_COLOR0 + i]; + } case GL_DEPTH: case GL_DEPTH_ATTACHMENT: case GL_DEPTH_STENCIL_ATTACHMENT: @@ -4691,6 +4695,29 @@ discard_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb, if (!att) continue; + /* If we're asked to invalidate just depth or just stencil, but the + * attachment is packed depth/stencil, then we can only use + * Driver.DiscardFramebuffer if the attachments list includes both depth + * and stencil and they both point at the same renderbuffer. 
+ */ + if ((attachments[i] == GL_DEPTH_ATTACHMENT || + attachments[i] == GL_STENCIL_ATTACHMENT) && + (!att->Renderbuffer || + att->Renderbuffer->_BaseFormat == GL_DEPTH_STENCIL)) { + GLenum other_format = (attachments[i] == GL_DEPTH_ATTACHMENT ? + GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT); + bool has_both = false; + for (int j = 0; j < numAttachments; j++) { + if (attachments[j] == other_format) { + has_both = true; + break; + } + } + + if (fb->Attachment[BUFFER_DEPTH].Renderbuffer != + fb->Attachment[BUFFER_STENCIL].Renderbuffer || !has_both) + continue; + } + + ctx->Driver.DiscardFramebuffer(ctx, fb, att); } } diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index ee77c45d03c..efc9c11f79d 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -906,6 +906,9 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu break; /* GL_EXT_external_objects */ + case GL_NUM_DEVICE_UUIDS_EXT: + v->value_int = 1; + break; case GL_DRIVER_UUID_EXT: _mesa_get_driver_uuid(ctx, v->value_int_4); break; diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk index c6470e6289e..13d0da85882 100644 --- a/src/mesa/program/Android.mk +++ b/src/mesa/program/Android.mk @@ -41,7 +41,7 @@ endef include $(MESA_TOP)/src/mesa/Makefile.sources include $(CLEAR_VARS) - +LOCAL_CFLAGS += -Wno-error LOCAL_MODULE := libmesa_program LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_STATIC_LIBRARIES := libmesa_nir \ diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c index 2bc1b6db6eb..4073030f536 100644 --- a/src/mesa/program/prog_parameter.c +++ b/src/mesa/program/prog_parameter.c @@ -271,6 +271,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList, p->Name = strdup(name ?
name : ""); p->Type = type; p->Size = size; + p->Padded = pad_and_align; p->DataType = datatype; paramList->ParameterValueOffset[oldNum] = oldValNum; diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h index cc551c18910..d3d5961f920 100644 --- a/src/mesa/program/prog_parameter.h +++ b/src/mesa/program/prog_parameter.h @@ -104,6 +104,12 @@ struct gl_program_parameter * A sequence of STATE_* tokens and integers to identify GL state. */ gl_state_index16 StateIndexes[STATE_LENGTH]; + + /** + * We need to keep track of whether the param is padded for use in the + * shader cache. + */ + bool Padded; }; diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c index fecaaf77da8..c54b50dc754 100644 --- a/src/mesa/state_tracker/st_cb_rasterpos.c +++ b/src/mesa/state_tracker/st_cb_rasterpos.c @@ -208,6 +208,10 @@ new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw) rs->prim.end = 1; rs->prim.start = 0; rs->prim.count = 1; + rs->prim.pad = 0; + rs->prim.num_instances = 1; + rs->prim.base_instance = 0; + rs->prim.is_indirect = 0; return rs; } diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 1e456d019d0..92e512a0f1c 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -223,8 +223,13 @@ void st_init_limits(struct pipe_screen *screen, pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents, MAX_UNIFORMS * 4); + /* For ARB programs, prog_src_register::Index is a signed 13-bit number. + * This gives us a limit of 4096 values - but we may need to generate + * internal values in addition to what the source program uses. So, we + * drop the limit one step lower, to 2048, to be safe. 
+ */ pc->MaxParameters = - pc->MaxNativeParameters = pc->MaxUniformComponents / 4; + pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048); pc->MaxInputComponents = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4; pc->MaxOutputComponents = @@ -362,10 +367,7 @@ void st_init_limits(struct pipe_screen *screen, c->Program[MESA_SHADER_VERTEX].MaxAttribs = MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); - /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number - * of inputs. It's always 2 colors + N generic inputs. */ - c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, - PIPE_SHADER_CAP_MAX_INPUTS); + c->MaxVarying = screen->get_param(screen, PIPE_CAP_MAX_VARYINGS); c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING); c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index aacb8788287..febde1a5e97 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -2356,6 +2356,8 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, bindings |= PIPE_BIND_DEPTH_STENCIL; else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 || internalFormat == GL_RGB || internalFormat == GL_RGBA || + internalFormat == GL_RGBA2 || + internalFormat == GL_RGB4 || internalFormat == GL_RGBA4 || internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 || internalFormat == GL_BGRA || internalFormat == GL_RGB16F || diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp index d7f2e3e6eaa..a05ec0fa586 100644 --- a/src/mesa/state_tracker/st_glsl_to_nir.cpp +++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp @@ -327,7 +327,7 @@ st_nir_opts(nir_shader *nir, bool scalar) NIR_PASS(progress, nir, nir_opt_if); NIR_PASS(progress, nir, nir_opt_dead_cf); NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, 
nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true); NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index 5efbd1fa1d2..67f1fcaf5ef 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -1105,10 +1105,17 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi, else { GET_CURRENT_CONTEXT(ctx); - ret = _mesa_make_current(NULL, NULL, NULL); - - if (ctx) + if (ctx) { + /* Before releasing the context, release its associated + * winsys buffers first. Then purge the context's winsys buffers list + * to free the resources of any winsys buffers that no longer have + * an existing drawable. + */ + ret = _mesa_make_current(ctx, NULL, NULL); st_framebuffers_purge(ctx->st); + } + + ret = _mesa_make_current(NULL, NULL, NULL); } return ret; diff --git a/src/mesa/state_tracker/st_tgsi_lower_yuv.c b/src/mesa/state_tracker/st_tgsi_lower_yuv.c index 6acd173adc9..73437ddda70 100644 --- a/src/mesa/state_tracker/st_tgsi_lower_yuv.c +++ b/src/mesa/state_tracker/st_tgsi_lower_yuv.c @@ -269,31 +269,39 @@ yuv_to_rgb(struct tgsi_transform_context *tctx, tctx->emit_instruction(tctx, &inst); /* DP3 dst.x, tmpA, imm[0] */ - inst = dp3_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X); - reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); - reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_X) { + inst = dp3_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X); + reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); + reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W)); + tctx->emit_instruction(tctx, &inst); + } /* DP3 dst.y, tmpA, imm[1] */ - inst = dp3_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y); - 
reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); - reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_Y) { + inst = dp3_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y); + reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); + reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W)); + tctx->emit_instruction(tctx, &inst); + } /* DP3 dst.z, tmpA, imm[2] */ - inst = dp3_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z); - reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); - reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_Z) { + inst = dp3_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z); + reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); + reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W)); + tctx->emit_instruction(tctx, &inst); + } /* MOV dst.w, imm[0].x */ - inst = mov_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W); - reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_W) { + inst = mov_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W); + reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W)); + tctx->emit_instruction(tctx, &inst); + } } static void @@ -434,7 +442,7 @@ st_tgsi_lower_yuv(const struct tgsi_token *tokens, unsigned free_slots, /* TODO better job of figuring out how many extra tokens we need.. 
* this is a pain about tgsi_transform :-/ */ - newlen = tgsi_num_tokens(tokens) + 120; + newlen = tgsi_num_tokens(tokens) + 300; newtoks = tgsi_alloc_tokens(newlen); if (!newtoks) return NULL; diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index cb0e6e659e2..c38334140b6 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -98,6 +98,11 @@ TODO: document the other workarounds.