diff --git a/Android.common.mk b/Android.common.mk index 60459d16eba..8e9e10a238d 100644 --- a/Android.common.mk +++ b/Android.common.mk @@ -31,6 +31,7 @@ LOCAL_C_INCLUDES += \ MESA_VERSION := $(shell cat $(MESA_TOP)/VERSION) LOCAL_CFLAGS += \ + -O3 \ -Wno-error \ -Wno-unused-parameter \ -Wno-pointer-arith \ @@ -77,14 +78,23 @@ LOCAL_CFLAGS += \ -fvisibility=hidden \ -fno-math-errno \ -fno-trapping-math \ - -Wno-sign-compare + -Wno-sign-compare \ + -Wno-self-assign \ + -Wno-constant-logical-operand \ + -Wno-format \ + -Wno-incompatible-pointer-types \ + -Wno-enum-conversion LOCAL_CPPFLAGS += \ -D__STDC_CONSTANT_MACROS \ -D__STDC_FORMAT_MACROS \ -D__STDC_LIMIT_MACROS \ -Wno-error=non-virtual-dtor \ - -Wno-non-virtual-dtor + -Wno-non-virtual-dtor \ + -Wno-delete-non-virtual-dtor \ + -Wno-overloaded-virtual \ + -Wno-missing-braces \ + -Wno-deprecated-register # mesa requires at least c99 compiler LOCAL_CONLYFLAGS += \ diff --git a/Makefile.am b/Makefile.am index e7e14f5b3cd..6d3c8cc19b4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,6 +22,7 @@ SUBDIRS = src AM_DISTCHECK_CONFIGURE_FLAGS = \ + --enable-autotools \ --enable-dri \ --enable-dri3 \ --enable-egl \ diff --git a/Readme.md b/Readme.md new file mode 100644 index 00000000000..5df295abc3a --- /dev/null +++ b/Readme.md @@ -0,0 +1,2 @@ +Any security related issues should be reported by following the instructions here: +https://01.org/security diff --git a/VERSION b/VERSION index 5bd94c44a5c..02f94dcfc16 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-devel +19.0.5 diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore new file mode 100644 index 00000000000..6c3f01790e0 --- /dev/null +++ b/bin/.cherry-ignore @@ -0,0 +1,40 @@ +# Both of these were already merged with different shas +da48cba61ef6fefb799bf96e6364b70dbf4ec712 +c812c740e60c14060eb89db66039111881a0f42f + +# The commit these fix was reverted from 19.0, but fixed for 19.1 due +# to the number of fixes required to make that commit work 
+8d8f80af3a17354508f2ec9d6559c915d5be351d +0c0c69729b6d72a5297122856c8fe48510e90764 +0881e90c09965818b02e359474a6f7446b41d647 +b031c643491a92a5574c7a4bd659df33f2d89bb6 + +# These were manually rebased by Jason, thanks! +8ab95b849e66f3221d80a67eef2ec6e3730901a8 +5c30fffeec1732c21d600c036f95f8cdb1bb5487 + +# This doesn't actually appliy to 19.0 +29179f58c6ba8099859ea25900214dbbd3814a92 + +# This was superceeded by a manual backport from ken +6981069fc805da1afc867ca3c905075d146d7ff9 + +# This was manually backported +0bc1942c9ddce4e796322a7561f06af5dec0decd + +# This doesn't need to be applied, it already seems to exist in stable. +80dc78407d0d1e03ceddf8889b217e8fd113568d + +# This was backported manually +4f18c43d1df64135e8968a7d4fbfd2c9918b76ae + +# These were de-nominated since they don't apply nicley +88105375c978f9de82af8c654051e5aa16d61614 +c9358621276ae49162e58d4a16fe37abda6a347f + +# These are only for 19.1 +c3538ab5702ceeead284c2b5f9e700f3082c8135 +d2aa65eb1892f7b300ac24560f9dbda6b600b5a7 +78e35df52aa2f7d770f929a0866a0faa89c261a9 +0f1b070bad34c46c4bcc6c679fa533bf6b4b79e5 +ad2b4aa37806779bdfc15d704940136c3db21eb4 diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh index 15f0e7d4a34..8fa4f438771 100755 --- a/bin/get-pick-list.sh +++ b/bin/get-pick-list.sh @@ -13,12 +13,12 @@ is_stable_nomination() { - git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable" + git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-stable" } is_typod_nomination() { - git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev" + git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-dev" } fixes= diff --git a/bin/install_megadrivers.py b/bin/install_megadrivers.py index d29b1911218..b5ac78887bf 100644 --- a/bin/install_megadrivers.py +++ b/bin/install_megadrivers.py @@ -35,7 +35,11 @@ def main(): args = parser.parse_args() if os.path.isabs(args.libdir): - to = os.path.join(os.environ.get('DESTDIR', '/'), args.libdir[1:]) + destdir = 
os.environ.get('DESTDIR') + if destdir: + to = os.path.join(destdir, args.libdir[1:]) + else: + to = args.libdir else: to = os.path.join(os.environ['MESON_INSTALL_DESTDIR_PREFIX'], args.libdir) @@ -45,7 +49,6 @@ def main(): if os.path.lexists(to): os.unlink(to) os.makedirs(to) - shutil.copy(args.megadriver, master) for driver in args.drivers: abs_driver = os.path.join(to, driver) @@ -67,7 +70,14 @@ def main(): name, ext = os.path.splitext(name) finally: os.chdir(ret) + + # Remove meson-created master .so and symlinks os.unlink(master) + name, ext = os.path.splitext(master) + while ext != '.so': + if os.path.lexists(name): + os.unlink(name) + name, ext = os.path.splitext(name) if __name__ == '__main__': diff --git a/configure.ac b/configure.ac index 858da79f4d0..b288ecbd265 100644 --- a/configure.ac +++ b/configure.ac @@ -122,7 +122,7 @@ LLVM_REQUIRED_OPENCL=3.9.0 LLVM_REQUIRED_R600=3.9.0 LLVM_REQUIRED_RADEONSI=7.0.0 LLVM_REQUIRED_RADV=7.0.0 -LLVM_REQUIRED_SWR=6.0.0 +LLVM_REQUIRED_SWR=7.0.0 dnl Check for progs AC_PROG_CPP @@ -1922,7 +1922,7 @@ if test x"$enable_dri3" = xyes; then dri3_modifier_modules="xcb-dri3 >= $XCBDRI3_MODIFIERS_REQUIRED xcb-present >= $XCBPRESENT_MODIFIERS_REQUIRED" PKG_CHECK_MODULES([XCB_DRI3_MODIFIERS], [$dri3_modifier_modules], [have_dri3_modifiers=yes], [have_dri3_modifiers=no]) - if test "x$have_dri3_modifiers" == xyes; then + if test "x$have_dri3_modifiers" = xyes; then DEFINES="$DEFINES -DHAVE_DRI3_MODIFIERS" fi fi @@ -2357,7 +2357,7 @@ if test "x$enable_xvmc" = xyes -o \ "x$enable_omx_tizonia" = xyes -o \ "x$enable_va" = xyes; then if echo $platforms | grep -q "x11"; then - PKG_CHECK_MODULES([VL], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED]) + PKG_CHECK_MODULES([VL], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED libdrm >= $LIBDRM_REQUIRED]) fi need_gallium_vl_winsys=yes fi @@ -2845,8 +2845,8 @@ if test -n "$with_gallium_drivers"; then fi # XXX: Keep in sync with LLVM_REQUIRED_SWR -AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test 
"x$LLVM_VERSION" != x6.0.0 -a \ - "x$LLVM_VERSION" != x6.0.1) +AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x7.0.0 -a \ + "x$LLVM_VERSION" != x7.0.1) if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium" @@ -2949,7 +2949,7 @@ if test "x$enable_llvm" = xyes; then dnl the LLVM library propagated in the Libs.private of the respective .pc dnl file which ensures complete dependency information when statically dnl linking. - if test "x$enable_glx" == xgallium-xlib; then + if test "x$enable_glx" = xgallium-xlib; then GL_PC_LIB_PRIV="$GL_PC_LIB_PRIV $LLVM_LIBS" fi if test "x$enable_gallium_osmesa" = xyes; then diff --git a/docs/envvars.html b/docs/envvars.html index c9733e65234..43d3a6cf169 100644 --- a/docs/envvars.html +++ b/docs/envvars.html @@ -338,6 +338,9 @@

VMware SVGA driver environment variables

for details.
  • SVGA_EXTRA_LOGGING - if set, enables extra logging to the vmware.log file, such as the OpenGL program's name and command line arguments. +
  • SVGA_NO_LOGGING - if set, disables logging to the vmware.log file. +This is useful when using Valgrind because it otherwise crashes when +initializing the host log feature.
  • See the driver code for other, lesser-used variables. diff --git a/docs/relnotes/19.0.0.html b/docs/relnotes/19.0.0.html index 1b4edd7ce76..ea22d660f37 100644 --- a/docs/relnotes/19.0.0.html +++ b/docs/relnotes/19.0.0.html @@ -32,7 +32,8 @@

    Mesa 19.0.0 Release Notes / TBD

    SHA256 checksums

    -TBD.
    +  4c5b9c5227d37c1f6bdc786a6fa7ee7fbce40b2e8a87340c7d3234534ece3304  mesa-19.0.0.tar.gz
    +  5a549dfb40ec31e5c36c47aadac04554cb2e2a8d144a046a378fc16da57e38f8  mesa-19.0.0.tar.xz
     
    @@ -60,13 +61,2413 @@

    New features

    Bug fixes

    Changes

    diff --git a/docs/relnotes/19.0.1.html b/docs/relnotes/19.0.1.html new file mode 100644 index 00000000000..d5f82f9b022 --- /dev/null +++ b/docs/relnotes/19.0.1.html @@ -0,0 +1,159 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.1 Release Notes / March 27, 2019

    + +

    +Mesa 19.0.1 is a bug fix release which fixes bugs found since the 19.0.0 release. +

    +

    +Mesa 19.0.1 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +f1dd1980ed628edea3935eed7974fbc5d8353e9578c562728b880d63ac613dbd  mesa-19.0.1.tar.gz
    +6884163c0ea9e4c98378ab8fecd72fe7b5f437713a14471beda378df247999d4  mesa-19.0.1.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + + + +

    Changes

    + +

    Andres Gomez (4):

    + + +

    Bas Nieuwenhuizen (2):

    + + +

    Danylo Piliaiev (2):

    + + +

    Dave Airlie (1):

    + + +

    Dylan Baker (5):

    + + +

    Eric Anholt (1):

    + + +

    Jason Ekstrand (6):

    + + +

    Józef Kucia (2):

    + + +

    Kenneth Graunke (1):

    + + +

    Kevin Strasser (1):

    + + +

    Mark Janes (1):

    + + +

    Plamena Manolova (1):

    + + +

    Samuel Pitoiset (3):

    + + +

    Sergii Romantsov (1):

    + + +

    Tapani Pälli (2):

    + + + +
    + + diff --git a/docs/relnotes/19.0.2.html b/docs/relnotes/19.0.2.html new file mode 100644 index 00000000000..e760bd4ac21 --- /dev/null +++ b/docs/relnotes/19.0.2.html @@ -0,0 +1,122 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.2 Release Notes / April 10, 2019

    + +

    +Mesa 19.0.2 is a bug fix release which fixes bugs found since the 19.0.1 release. +

    +

    +Mesa 19.0.2 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +eb972fc11d4e1261d34ec0b91a701f158d4870c0428fb108353ae7eab64b1118  mesa-19.0.2.tar.gz
    +1a2edc3ce56906a676c91e6851298db45903df1f5cb9827395a922c1452db802  mesa-19.0.2.tar.xz
    +
    + + +

    New features

    + + +

    Bug fixes

    + + + + +

    Changes

    + + +

    Boyuan Zhang (1):

    + + +

    Caio Marcelo de Oliveira Filho (1):

    + + +

    Dylan Baker (2):

    + + +

    Eric Anholt (3):

    + + +

    Eric Engestrom (1):

    + + +

    Jason Ekstrand (1):

    + + +

    Karol Herbst (1):

    + + +

    Leo Liu (2):

    + + +

    Lionel Landwerlin (1):

    + + +

    Marek Olšák (1):

    + + +

    Samuel Pitoiset (2):

    + + +
    + + diff --git a/docs/relnotes/19.0.3.html b/docs/relnotes/19.0.3.html new file mode 100644 index 00000000000..d0fe3deb1ba --- /dev/null +++ b/docs/relnotes/19.0.3.html @@ -0,0 +1,148 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.3 Release Notes / April 24, 2019

    + +

    +Mesa 19.0.3 is a bug fix release which fixes bugs found since the 19.0.2 release. +

    +

    +Mesa 19.0.3 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +59543ec3c9f8c72990e77887f13d1678cb6739e5d5f56abc21ebf9e772389c5e  mesa-19.0.3.tar.gz
    +f027244e38dc309a4c12db45ef79be81ab62c797a50a88d566e4edb6159fc4d5  mesa-19.0.3.tar.xz
    +
    + + +

    New features

    + +

    N/A

    + +

    Bug fixes

    + + + +

    Changes

    + +

    Andres Gomez (1):

    + + +

    Bas Nieuwenhuizen (1):

    + + +

    Chia-I Wu (1):

    + + +

    Danylo Piliaiev (1):

    + + +

    Dylan Baker (2):

    + + +

    Eric Anholt (1):

    + + +

    Eric Engestrom (1):

    + + +

    Jason Ekstrand (2):

    + + +

    Juan A. Suarez Romero (1):

    + + +

    Kenneth Graunke (2):

    + + +

    Lionel Landwerlin (2):

    + + +

    Lubomir Rintel (2):

    + + +

    Marek Olšák (1):

    + + +

    Rhys Perry (1):

    + + +

    Roland Scheidegger (1):

    + + +

    Samuel Pitoiset (2):

    + + + +
    + + diff --git a/docs/relnotes/19.0.4.html b/docs/relnotes/19.0.4.html new file mode 100644 index 00000000000..7c1d493f9ee --- /dev/null +++ b/docs/relnotes/19.0.4.html @@ -0,0 +1,243 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.4 Release Notes / May 9, 2019

    + +

    +Mesa 19.0.4 is a bug fix release which fixes bugs found since the 19.0.3 release. +

    +

    +Mesa 19.0.4 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +de361c76bf7aae09219f571b9ae77a34864a1cd9f6ba24c845b18b3cd5e4b9a2  mesa-19.0.4.tar.gz
    +39f9f32f448d77388ef817c6098d50eb0c1595815ce7e895dec09dd68774ce47  mesa-19.0.4.tar.xz
    +
    + + +

    New features

    + +

    N/A

    + +

    Bug fixes

    + + + +

    Changes

    + +

    Alejandro Piñeiro (1):

    + + +

    Andrii Simiklit (1):

    + + +

    Axel Davy (1):

    + + +

    Bas Nieuwenhuizen (1):

    + + +

    Brian Paul (1):

    + + +

    Caio Marcelo de Oliveira Filho (1):

    + + +

    Charmaine Lee (1):

    + + +

    Chuck Atkins (1):

    + + +

    Daniel Stone (1):

    + + +

    Dave Airlie (2):

    + + +

    Dylan Baker (7):

    + + +

    Emil Velikov (3):

    + + +

    Erik Faye-Lund (2):

    + + +

    Francisco Jerez (2):

    + + +

    Hal Gentz (1):

    + + +

    Ian Romanick (2):

    + + +

    Jason Ekstrand (1):

    + + +

    Jon Turney (1):

    + + +

    Juan A. Suarez Romero (2):

    + + +

    Kenneth Graunke (6):

    + + +

    Lionel Landwerlin (4):

    + + +

    Marek Olšák (2):

    + + +

    Nicolai Hähnle (1):

    + + +

    Rhys Perry (1):

    + + +

    Ross Burton (1):

    + + +

    Samuel Pitoiset (8):

    + + +

    Tapani Pälli (1):

    + + +

    Timothy Arceri (4):

    + + +
    + + diff --git a/docs/relnotes/19.0.5.html b/docs/relnotes/19.0.5.html new file mode 100644 index 00000000000..bf0cd43eaaa --- /dev/null +++ b/docs/relnotes/19.0.5.html @@ -0,0 +1,137 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 19.0.5 Release Notes / May 21, 2019

    + +

    +Mesa 19.0.5 is a bug fix release which fixes bugs found since the 19.0.4 release. +

    +

    +Mesa 19.0.5 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +b6e6b78c23bec15d1e7887c78b7ad00ce395ea1b20ad8aab6ce441f55f724e70  mesa-19.0.5.tar.gz
    +6aecb7f67c136768692fb3c33a54196186c6c4fcafab7973516a355e1a54f831  mesa-19.0.5.tar.xz
    +
    + + +

    New features

    + +

    N/A

    + +

    Bug fixes

    + + + +

    Changes

    + +

    Caio Marcelo de Oliveira Filho (2):

    + + +

    Charmaine Lee (2):

    + + +

    Dylan Baker (4):

    + + +

    Eric Engestrom (1):

    + + +

    Gert Wollny (2):

    + + +

    Ian Romanick (1):

    + + +

    Jason Ekstrand (3):

    + + +

    Józef Kucia (1):

    + + +

    Kenneth Graunke (1):

    + + +

    Leo Liu (1):

    + + +

    Lionel Landwerlin (1):

    + + +

    Marek Olšák (1):

    + + +

    Nanley Chery (1):

    + + +

    Samuel Pitoiset (1):

    + + +
    + + diff --git a/docs/shading.html b/docs/shading.html index 9e3c7218e31..76f25316f86 100644 --- a/docs/shading.html +++ b/docs/shading.html @@ -59,6 +59,7 @@

    Environment Variables

  • nopfrag - force fragment shader to be a simple shader that passes through the color attribute.
  • useprog - log glUseProgram calls to stderr +
  • errors - GLSL compilation and link errors will be reported to stderr.

    Example: export MESA_GLSL=dump,nopt diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h index f2e46f65f92..6d134e3a40f 100644 --- a/include/GL/internal/dri_interface.h +++ b/include/GL/internal/dri_interface.h @@ -1352,6 +1352,10 @@ struct __DRIdri2ExtensionRec { #define __DRI_IMAGE_FOURCC_YVU422 0x36315659 #define __DRI_IMAGE_FOURCC_YVU444 0x34325659 +#define __DRI_IMAGE_FOURCC_P010 0x30313050 +#define __DRI_IMAGE_FOURCC_P012 0x32313050 +#define __DRI_IMAGE_FOURCC_P016 0x36313050 + /** * Queryable on images created by createImageFromNames. * diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h index 7201562d824..b91abd7a3f9 100644 --- a/include/pci_ids/i965_pci_ids.h +++ b/include/pci_ids/i965_pci_ids.h @@ -171,6 +171,7 @@ CHIPSET(0x3185, glk_2x6, "Intel(R) UHD Graphics 600 (Geminilake 2x6)") CHIPSET(0x3E90, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)") CHIPSET(0x3E93, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)") CHIPSET(0x3E99, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)") +CHIPSET(0x3E9C, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)") CHIPSET(0x3E91, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") CHIPSET(0x3E92, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)") CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)") @@ -203,6 +204,10 @@ CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)") CHIPSET(0x8A50, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") CHIPSET(0x8A51, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") CHIPSET(0x8A52, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)") +CHIPSET(0x8A56, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") +CHIPSET(0x8A57, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") +CHIPSET(0x8A58, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") +CHIPSET(0x8A59, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") CHIPSET(0x8A5A, icl_6x8, "Intel(R) HD 
Graphics (Ice Lake 6x8 GT1.5)") CHIPSET(0x8A5B, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)") CHIPSET(0x8A5C, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)") diff --git a/meson.build b/meson.build index d975b0dbf4b..5286b91c615 100644 --- a/meson.build +++ b/meson.build @@ -61,11 +61,11 @@ endif dri_drivers_path = get_option('dri-drivers-path') if dri_drivers_path == '' - dri_drivers_path = join_paths(get_option('libdir'), 'dri') + dri_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'dri') endif dri_search_path = get_option('dri-search-path') if dri_search_path == '' - dri_search_path = join_paths(get_option('prefix'), dri_drivers_path) + dri_search_path = dri_drivers_path endif with_gles1 = get_option('gles1') @@ -608,7 +608,7 @@ with_gallium_xa = _xa != 'false' d3d_drivers_path = get_option('d3d-drivers-path') if d3d_drivers_path == '' - d3d_drivers_path = join_paths(get_option('libdir'), 'd3d') + d3d_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'd3d') endif with_gallium_st_nine = get_option('gallium-nine') @@ -1213,6 +1213,7 @@ if _llvm != 'false' with_gallium_opencl or _llvm == 'true' ), static : not _shared_llvm, + method : 'config-tool', ) with_llvm = dep_llvm.found() endif @@ -1387,12 +1388,14 @@ if with_platform_x11 dep_xshmfence = dependency('xshmfence', version : '>= 1.1') endif endif - if with_glx == 'dri' + if with_glx == 'dri' or with_glx == 'gallium-xlib' + dep_glproto = dependency('glproto', version : '>= 1.4.14') + endif + if with_glx == 'dri' if with_dri_platform == 'drm' dep_dri2proto = dependency('dri2proto', version : '>= 2.8') dep_xxf86vm = dependency('xxf86vm') endif - dep_glproto = dependency('glproto', version : '>= 1.4.14') endif if (with_egl or ( with_gallium_vdpau or with_gallium_xvmc or with_gallium_xa or @@ -1400,7 +1403,7 @@ if with_platform_x11 dep_xcb_xfixes = dependency('xcb-xfixes') endif if with_xlib_lease - dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12') + 
dep_xcb_xrandr = dependency('xcb-randr') dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3') endif endif diff --git a/scons/custom.py b/scons/custom.py index 09946fa7324..8028990ef61 100644 --- a/scons/custom.py +++ b/scons/custom.py @@ -48,7 +48,12 @@ # a path directly. We want to support both, so we need to detect the SCons version, # for which no API is provided by SCons 8-P -scons_version = tuple(map(int, SCons.__version__.split('.'))) +# Scons version string has consistently been in this format: +# MajorVersion.MinorVersion.Patch[.alpha/beta.yyyymmdd] +# so this formula should cover all versions regardless of type +# stable, alpha or beta. +# For simplicity alpha and beta flags are removed. +scons_version = tuple(map(int, SCons.__version__.split('.')[:3])) def quietCommandLines(env): # Quiet command lines diff --git a/scons/gallium.py b/scons/gallium.py index 963834a5fbc..61bbeb2399f 100755 --- a/scons/gallium.py +++ b/scons/gallium.py @@ -308,7 +308,20 @@ def generate(env): if env.GetOption('num_jobs') <= 1: env.SetOption('num_jobs', num_jobs()) - env.Decider('MD5-timestamp') + # Speed up dependency checking. See + # - https://github.com/SCons/scons/wiki/GoFastButton + # - https://bugs.freedesktop.org/show_bug.cgi?id=109443 + + # Scons version string has consistently been in this format: + # MajorVersion.MinorVersion.Patch[.alpha/beta.yyyymmdd] + # so this formula should cover all versions regardless of type + # stable, alpha or beta. + # For simplicity alpha and beta flags are removed. 
+ + scons_version = distutils.version.StrictVersion('.'.join(SCons.__version__.split('.')[:3])) + if scons_version < distutils.version.StrictVersion('3.0.2') or \ + scons_version > distutils.version.StrictVersion('3.0.4'): + env.Decider('MD5-timestamp') env.SetOption('max_drift', 60) # C preprocessor options diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index fc8c6a09d2f..7ba13c24953 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -367,9 +367,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev, info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20; info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21; info->has_ctx_priority = info->drm_minor >= 22; - /* TODO: Enable this once the kernel handles it efficiently. */ - info->has_local_buffers = info->drm_minor >= 20 && - !info->has_dedicated_vram; + info->has_local_buffers = info->drm_minor >= 20; info->kernel_flushes_hdp_before_ib = true; info->htile_cmask_support_1d_tiling = true; info->si_TA_CS_BC_BASE_ADDR_allowed = true; diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 768364b2dc6..3d7eb7b0421 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -905,6 +905,37 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); } +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[6]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = ctx->i1false; + args[4] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = ctx->i1false; + args[5] = params; + + return 
ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); +} + LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, @@ -923,6 +954,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx, ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); } +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); +} + LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index e47893bbbe6..370e7e9741c 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -216,6 +216,14 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef i, LLVMValueRef j); +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, @@ -223,6 +231,11 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params); +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); + LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c index 69446863b95..6063411310b 100644 --- a/src/amd/common/ac_llvm_util.c +++ b/src/amd/common/ac_llvm_util.c @@ -151,13 +151,14 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, LLVMTargetRef target = ac_get_llvm_target(triple); snprintf(features, sizeof(features), - "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s", + "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s", HAVE_LLVM >= 0x0800 ? 
"" : ",+vgpr-spilling", tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "", tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "", tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "", - tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : ""); - + tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "", + tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : ""); + LLVMTargetMachineRef tm = LLVMCreateTargetMachine( target, triple, diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h index 6d961c06f8a..ca00540da80 100644 --- a/src/amd/common/ac_llvm_util.h +++ b/src/amd/common/ac_llvm_util.h @@ -65,6 +65,7 @@ enum ac_target_machine_options { AC_TM_CHECK_IR = (1 << 5), AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6), AC_TM_CREATE_LOW_OPT = (1 << 7), + AC_TM_NO_LOAD_STORE_OPT = (1 << 8), }; enum ac_float_mode { diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index efd3e260af1..a0815995b12 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1019,10 +1019,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) LLVMValueRef in[3]; for (unsigned chan = 0; chan < 3; chan++) in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan); - results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", + results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); - results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", + results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", + ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE); + results[0] = ac_build_fdiv(&ctx->ac, results[0], ma); + results[1] = ac_build_fdiv(&ctx->ac, results[1], ma); + LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5); + results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, ""); + 
results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, ""); result = ac_build_gather_values(&ctx->ac, results, 2); break; } @@ -1896,14 +1903,18 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, if (var) { bool vs_in = ctx->stage == MESA_SHADER_VERTEX && var->data.mode == nir_var_shader_in; - if (var->data.compact) - stride = 1; idx = var->data.driver_location; comp = var->data.location_frac; mode = var->data.mode; get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), vs_in, NULL, NULL, &const_index, &indir_index); + + if (var->data.compact) { + stride = 1; + const_index += comp; + comp = 0; + } } if (instr->dest.ssa.bit_size == 64) @@ -2006,18 +2017,28 @@ static void visit_store_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); LLVMValueRef temp_ptr, value; - int idx = var->data.driver_location; - unsigned comp = var->data.location_frac; + int idx = 0; + unsigned comp = 0; LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); int writemask = instr->const_index[0]; LLVMValueRef indir_index; unsigned const_index; - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false, - NULL, NULL, &const_index, &indir_index); + if (var) { + get_deref_offset(ctx, deref, false, + NULL, NULL, &const_index, &indir_index); + idx = var->data.driver_location; + comp = var->data.location_frac; + + if (var->data.compact) { + const_index += comp; + comp = 0; + } + } if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) { @@ -2030,7 +2051,7 @@ visit_store_var(struct ac_nir_context *ctx, writemask = writemask << comp; - switch (var->data.mode) { + switch (deref->mode) { case nir_var_shader_out: if (ctx->stage == MESA_SHADER_TESS_CTRL) { @@ -2039,8 
+2060,8 @@ visit_store_var(struct ac_nir_context *ctx, unsigned const_index = 0; const bool is_patch = var->data.patch; - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - false, NULL, is_patch ? NULL : &vertex_index, + get_deref_offset(ctx, deref, false, NULL, + is_patch ? NULL : &vertex_index, &const_index, &indir_index); ctx->abi->store_tcs_outputs(ctx->abi, var, @@ -2107,7 +2128,7 @@ visit_store_var(struct ac_nir_context *ctx, int writemask = instr->const_index[0]; LLVMValueRef address = get_src(ctx, instr->src[0]); LLVMValueRef val = get_src(ctx, instr->src[1]); - if (util_is_power_of_two_nonzero(writemask)) { + if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) { val = LLVMBuildBitCast( ctx->ac.builder, val, LLVMGetElementType(LLVMTypeOf(address)), ""); @@ -2338,10 +2359,12 @@ static void get_image_coords(struct ac_nir_context *ctx, } static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx, - const nir_intrinsic_instr *instr, bool write) + const nir_intrinsic_instr *instr, + bool write, bool atomic) { LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write); - if (ctx->abi->gfx9_stride_size_workaround) { + if (ctx->abi->gfx9_stride_size_workaround || + (ctx->abi->gfx9_stride_size_workaround_for_atomic && atomic)) { LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), ""); LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), ""); stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), ""); @@ -2374,7 +2397,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, unsigned num_channels = util_last_bit(mask); LLVMValueRef rsrc, vindex; - rsrc = get_image_buffer_descriptor(ctx, instr, false); + rsrc = get_image_buffer_descriptor(ctx, instr, false, false); vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, 
""); @@ -2418,7 +2441,7 @@ static void visit_image_store(struct ac_nir_context *ctx, if (dim == GLSL_SAMPLER_DIM_BUF) { char name[48]; const char *types[] = { "f32", "v2f32", "v4f32" }; - LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true); + LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false); LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); unsigned src_channels = ac_get_llvm_num_components(src); @@ -2514,11 +2537,14 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, params[param_count++] = get_src(ctx, instr->src[3]); if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { - params[param_count++] = get_image_buffer_descriptor(ctx, instr, true); + params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true); params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); /* vindex */ params[param_count++] = ctx->ac.i32_0; /* voffset */ - if (HAVE_LLVM >= 0x800) { + if (HAVE_LLVM >= 0x900) { + /* XXX: The new raw/struct atomic intrinsics are buggy + * with LLVM 8, see r358579. 
+ */ params[param_count++] = ctx->ac.i32_0; /* soffset */ params[param_count++] = ctx->ac.i32_0; /* slc */ @@ -3079,7 +3105,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, ctx->abi->frag_pos[2], ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3]) }; - result = ac_build_gather_values(&ctx->ac, values, 4); + result = ac_to_integer(&ctx->ac, + ac_build_gather_values(&ctx->ac, values, 4)); break; } case nir_intrinsic_load_front_face: @@ -3818,6 +3845,73 @@ static void visit_jump(struct ac_llvm_context *ctx, } } +static LLVMTypeRef +glsl_base_to_llvm_type(struct ac_llvm_context *ac, + enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return ac->i32; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; + case GLSL_TYPE_FLOAT: + return ac->f32; + case GLSL_TYPE_FLOAT16: + return ac->f16; + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + return ac->i64; + case GLSL_TYPE_DOUBLE: + return ac->f64; + default: + unreachable("unknown GLSL type"); + } +} + +static LLVMTypeRef +glsl_to_llvm_type(struct ac_llvm_context *ac, + const struct glsl_type *type) +{ + if (glsl_type_is_scalar(type)) { + return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); + } + + if (glsl_type_is_vector(type)) { + return LLVMVectorType( + glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + } + + if (glsl_type_is_matrix(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_column_type(type)), + glsl_get_matrix_columns(type)); + } + + if (glsl_type_is_array(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_array_element(type)), + glsl_get_length(type)); + } + + assert(glsl_type_is_struct(type)); + + LLVMTypeRef member_types[glsl_get_length(type)]; + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + member_types[i] = + glsl_to_llvm_type(ac, + glsl_get_struct_field(type, i)); + } + + return 
LLVMStructTypeInContext(ac->context, member_types, + glsl_get_length(type), false); +} + static void visit_deref(struct ac_nir_context *ctx, nir_deref_instr *instr) { @@ -3839,9 +3933,27 @@ static void visit_deref(struct ac_nir_context *ctx, result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index)); break; - case nir_deref_type_cast: + case nir_deref_type_ptr_as_array: + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + break; + case nir_deref_type_cast: { result = get_src(ctx, instr->parent); + + LLVMTypeRef pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); + LLVMTypeRef type = LLVMPointerType(pointee_type, AC_ADDR_SPACE_LDS); + + if (LLVMTypeOf(result) != type) { + if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { + result = LLVMBuildBitCast(ctx->ac.builder, result, + type, ""); + } else { + result = LLVMBuildIntToPtr(ctx->ac.builder, result, + type, ""); + } + } break; + } default: unreachable("Unhandled deref_instr deref type"); } @@ -3990,73 +4102,6 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, } } -static LLVMTypeRef -glsl_base_to_llvm_type(struct ac_llvm_context *ac, - enum glsl_base_type type) -{ - switch (type) { - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_BOOL: - case GLSL_TYPE_SUBROUTINE: - return ac->i32; - case GLSL_TYPE_INT16: - case GLSL_TYPE_UINT16: - return ac->i16; - case GLSL_TYPE_FLOAT: - return ac->f32; - case GLSL_TYPE_FLOAT16: - return ac->f16; - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - return ac->i64; - case GLSL_TYPE_DOUBLE: - return ac->f64; - default: - unreachable("unknown GLSL type"); - } -} - -static LLVMTypeRef -glsl_to_llvm_type(struct ac_llvm_context *ac, - const struct glsl_type *type) -{ - if (glsl_type_is_scalar(type)) { - return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); - } - - if (glsl_type_is_vector(type)) { - return LLVMVectorType( - glsl_base_to_llvm_type(ac, 
glsl_get_base_type(type)), - glsl_get_vector_elements(type)); - } - - if (glsl_type_is_matrix(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_column_type(type)), - glsl_get_matrix_columns(type)); - } - - if (glsl_type_is_array(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_array_element(type)), - glsl_get_length(type)); - } - - assert(glsl_type_is_struct(type)); - - LLVMTypeRef member_types[glsl_get_length(type)]; - - for (unsigned i = 0; i < glsl_get_length(type); i++) { - member_types[i] = - glsl_to_llvm_type(ac, - glsl_get_struct_field(type, i)); - } - - return LLVMStructTypeInContext(ac->context, member_types, - glsl_get_length(type), false); -} - static void setup_locals(struct ac_nir_context *ctx, struct nir_function *func) diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index ee18e6c1923..9eb4d37257e 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -195,6 +195,7 @@ struct ac_shader_abi { /* Whether to workaround GFX9 ignoring the stride for the buffer size if IDXEN=0 * and LLVM optimizes an indexed load with constant index to IDXEN=0. 
*/ bool gfx9_stride_size_workaround; + bool gfx9_stride_size_workaround_for_atomic; }; #endif /* AC_SHADER_ABI_H */ diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 7f7f052986e..1271c3e73f2 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -301,7 +301,6 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer) static VkResult radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) { - cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, @@ -326,6 +325,8 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->record_result = VK_SUCCESS; + memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings)); + for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) { cmd_buffer->descriptors[i].dirty = 0; cmd_buffer->descriptors[i].valid = 0; @@ -338,14 +339,15 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) unsigned fence_offset, eop_bug_offset; void *fence_ptr; - radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0, &fence_offset, + radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset, &fence_ptr); + cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); cmd_buffer->gfx9_fence_va += fence_offset; /* Allocate a buffer for the EOP bug on GFX9. 
*/ - radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 0, + radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8, &eop_bug_offset, &fence_ptr); cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); @@ -416,6 +418,8 @@ radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned *out_offset, void **ptr) { + assert(util_is_power_of_two_nonzero(alignment)); + uint64_t offset = align(cmd_buffer->upload.offset, alignment); if (offset + size > cmd_buffer->upload.size) { if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size)) @@ -1255,7 +1259,7 @@ radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) ++reg_count; - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); @@ -1279,7 +1283,7 @@ radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, uint64_t va = radv_buffer_get_va(image->bo); va += image->offset + image->tc_compat_zrange_offset; - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); @@ -1356,7 +1360,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; - if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) { + if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); @@ -1473,7 +1477,7 @@ radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, assert(radv_image_has_cmask(image) || radv_image_has_dcc(image)); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 0)); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 
cmd_buffer->state.predicating)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); @@ -1518,14 +1522,13 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; - if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) { + if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); radeon_emit(cs, 2); } else { - /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | @@ -2155,6 +2158,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect, + !!draw_info->strmout_buffer, draw_info->indirect ? 0 : draw_info->count); if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) { @@ -4404,10 +4408,15 @@ static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffe if (!radv_image_has_htile(image)) return; - if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED && - radv_layout_has_htile(image, dst_layout, dst_queue_mask)) { - /* TODO: merge with the clear if applicable */ - radv_initialize_htile(cmd_buffer, image, range, 0); + if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 
0xfffff30f : 0xfffc000f; + + if (radv_layout_is_htile_compressed(image, dst_layout, + dst_queue_mask)) { + clear_value = 0; + } + + radv_initialize_htile(cmd_buffer, image, range, clear_value); } else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) && radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) { uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f; @@ -4837,8 +4846,11 @@ void radv_CmdBeginConditionalRenderingEXT( { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer); + struct radeon_cmdbuf *cs = cmd_buffer->cs; bool draw_visible = true; - uint64_t va; + uint64_t pred_value = 0; + uint64_t va, new_va; + unsigned pred_offset; va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset; @@ -4854,13 +4866,51 @@ void radv_CmdBeginConditionalRenderingEXT( si_emit_cache_flush(cmd_buffer); + /* From the Vulkan spec 1.1.107: + * + * "If the 32-bit value at offset in buffer memory is zero, then the + * rendering commands are discarded, otherwise they are executed as + * normal. If the value of the predicate in buffer memory changes while + * conditional rendering is active, the rendering commands may be + * discarded in an implementation-dependent way. Some implementations + * may latch the value of the predicate upon beginning conditional + * rendering while others may read it before every rendering command." + * + * But, the AMD hardware treats the predicate as a 64-bit value which + * means we need a workaround in the driver. Luckily, it's not required + * to support if the value changes when predication is active. 
+ * + * The workaround is as follows: + * 1) allocate a 64-value in the upload BO and initialize it to 0 + * 2) copy the 32-bit predicate value to the upload BO + * 3) use the new allocated VA address for predication + * + * Based on the conditionalrender demo, it's faster to do the COPY_DATA + * in ME (+ sync PFP) instead of PFP. + */ + radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset); + + new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset; + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | + COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, new_va); + radeon_emit(cs, new_va >> 32); + + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + /* Enable predication for this command buffer. */ - si_emit_set_predication_state(cmd_buffer, draw_visible, va); + si_emit_set_predication_state(cmd_buffer, draw_visible, new_va); cmd_buffer->state.predicating = true; /* Store conditional rendering user info. 
*/ cmd_buffer->state.predication_type = draw_visible; - cmd_buffer->state.predication_va = va; + cmd_buffer->state.predication_va = new_va; } void radv_CmdEndConditionalRenderingEXT( @@ -4904,7 +4954,7 @@ void radv_CmdBindTransformFeedbackBuffersEXT( enabled_mask |= 1 << idx; } - cmd_buffer->state.streamout.enabled_mask = enabled_mask; + cmd_buffer->state.streamout.enabled_mask |= enabled_mask; cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER; } diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 01712bd22ce..ac93434b8bd 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -51,6 +51,7 @@ enum { RADV_DEBUG_CHECKIR = 0x200000, RADV_DEBUG_NOTHREADLLVM = 0x400000, RADV_DEBUG_NOBINNING = 0x800000, + RADV_DEBUG_NO_LOAD_STORE_OPT = 0x1000000, }; enum { diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c index cebe06aa078..68171b5d244 100644 --- a/src/amd/vulkan/radv_descriptor_set.c +++ b/src/amd/vulkan/radv_descriptor_set.c @@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout( uint32_t immutable_sampler_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding); - if (pCreateInfo->pBindings[j].pImmutableSamplers) + if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + pCreateInfo->pBindings[j].pImmutableSamplers) immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; } @@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout( set_layout->has_variable_descriptors = true; } - if (binding->pImmutableSamplers) { + if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + binding->pImmutableSamplers) { set_layout->binding[b].immutable_samplers_offset = samplers_offset; 
set_layout->binding[b].immutable_samplers_equal = has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount); @@ -965,9 +969,11 @@ void radv_update_descriptor_sets( } src_ptr += src_binding_layout->size / 4; dst_ptr += dst_binding_layout->size / 4; - dst_buffer_list[j] = src_buffer_list[j]; - ++src_buffer_list; - ++dst_buffer_list; + + if (src_binding_layout->type != VK_DESCRIPTOR_TYPE_SAMPLER) { + /* Sampler descriptors don't have a buffer list. */ + dst_buffer_list[j] = src_buffer_list[j]; + } } } } diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 34d93b262f8..334c8bd4548 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -111,6 +111,7 @@ radv_get_device_name(enum radeon_family family, char *name, size_t name_len) case CHIP_VEGAM: chip_string = "AMD RADV VEGA M"; break; case CHIP_VEGA10: chip_string = "AMD RADV VEGA10"; break; case CHIP_VEGA12: chip_string = "AMD RADV VEGA12"; break; + case CHIP_VEGA20: chip_string = "AMD RADV VEGA20"; break; case CHIP_RAVEN: chip_string = "AMD RADV RAVEN"; break; case CHIP_RAVEN2: chip_string = "AMD RADV RAVEN2"; break; default: chip_string = "AMD RADV unknown"; break; @@ -337,7 +338,7 @@ radv_physical_device_init(struct radv_physical_device *device, device->rad_info.chip_class > GFX9) fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); - radv_get_driver_uuid(&device->device_uuid); + radv_get_driver_uuid(&device->driver_uuid); radv_get_device_uuid(&device->rad_info, &device->device_uuid); if (device->rad_info.family == CHIP_STONEY || @@ -369,6 +370,11 @@ radv_physical_device_init(struct radv_physical_device *device, device->dcc_msaa_allowed = (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA); + /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. 
*/ + device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 || + (device->rad_info.chip_class >= VI && + device->rad_info.me_fw_feature >= 41); + radv_physical_device_init_mem_types(device); radv_fill_device_extension_table(device, &device->supported_extensions); @@ -460,6 +466,7 @@ static const struct debug_control radv_debug_options[] = { {"checkir", RADV_DEBUG_CHECKIR}, {"nothreadllvm", RADV_DEBUG_NOTHREADLLVM}, {"nobinning", RADV_DEBUG_NOBINNING}, + {"noloadstoreopt", RADV_DEBUG_NO_LOAD_STORE_OPT}, {NULL, 0} }; @@ -505,6 +512,13 @@ radv_handle_per_app_options(struct radv_instance *instance, } else if (!strcmp(name, "DOOM_VFR")) { /* Work around a Doom VFR game bug */ instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS; + } else if (!strcmp(name, "MonsterHunterWorld.exe")) { + /* Workaround for a WaW hazard when LLVM moves/merges + * load/store memory operations. + * See https://reviews.llvm.org/D61313 + */ + if (HAVE_LLVM < 0x900) + instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT; } } @@ -734,8 +748,7 @@ void radv_GetPhysicalDeviceFeatures( .alphaToOne = true, .multiViewport = true, .samplerAnisotropy = true, - .textureCompressionETC2 = pdevice->rad_info.chip_class >= GFX9 || - pdevice->rad_info.family == CHIP_STONEY, + .textureCompressionETC2 = radv_device_supports_etc(pdevice), .textureCompressionASTC_LDR = false, .textureCompressionBC = true, .occlusionQueryPrecise = true, @@ -802,7 +815,7 @@ void radv_GetPhysicalDeviceFeatures2( features->storageBuffer16BitAccess = enabled; features->uniformAndStorageBuffer16BitAccess = enabled; features->storagePushConstant16 = enabled; - features->storageInputOutput16 = enabled; + features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { @@ -998,7 +1011,7 @@ void radv_GetPhysicalDeviceProperties( .maxCullDistances = 8, .maxCombinedClipAndCullDistances = 8, .discreteQueuePriorities = 2, - .pointSizeRange = { 
0.125, 255.875 }, + .pointSizeRange = { 0.0, 8192.0 }, .lineWidthRange = { 0.0, 7.9921875 }, .pointSizeGranularity = (1.0 / 8.0), .lineWidthGranularity = (1.0 / 128.0), @@ -2790,7 +2803,7 @@ VkResult radv_QueueSubmit( struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL; struct radeon_winsys_ctx *ctx = queue->hw_ctx; int ret; - uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX; + uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT; uint32_t scratch_size = 0; uint32_t compute_scratch_size = 0; uint32_t esgs_ring_size = 0, gsvs_ring_size = 0; diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 1bf56943f25..187c0ba574d 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -100,7 +100,7 @@ def __init__(self, name, ext_version, enable): Extension('VK_EXT_display_control', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'), Extension('VK_EXT_debug_report', 9, True), Extension('VK_EXT_depth_range_unrestricted', 1, True), - Extension('VK_EXT_descriptor_indexing', 2, True), + Extension('VK_EXT_descriptor_indexing', 2, False), Extension('VK_EXT_discard_rectangles', 1, True), Extension('VK_EXT_external_memory_dma_buf', 1, True), Extension('VK_EXT_external_memory_host', 1, 'device->rad_info.has_userptr'), diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c index 499d94befeb..9c61e769ebd 100644 --- a/src/amd/vulkan/radv_formats.c +++ b/src/amd/vulkan/radv_formats.c @@ -595,6 +595,14 @@ static bool radv_is_filter_minmax_format_supported(VkFormat format) } } +bool +radv_device_supports_etc(struct radv_physical_device *physical_device) +{ + return physical_device->rad_info.family == CHIP_VEGA10 || + physical_device->rad_info.family == CHIP_RAVEN || + physical_device->rad_info.family == CHIP_STONEY; +} + static void radv_physical_device_get_format_properties(struct radv_physical_device *physical_device, VkFormat format, @@ -612,9 +620,7 
@@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } if (desc->layout == VK_FORMAT_LAYOUT_ETC && - physical_device->rad_info.family != CHIP_VEGA10 && - physical_device->rad_info.family != CHIP_RAVEN && - physical_device->rad_info.family != CHIP_STONEY) { + !radv_device_supports_etc(physical_device)) { out_properties->linearTilingFeatures = linear; out_properties->optimalTilingFeatures = tiled; out_properties->bufferFeatures = buffer; @@ -984,10 +990,22 @@ bool radv_format_pack_clear_color(VkFormat format, assert(channel->size == 8); v = util_format_linear_float_to_srgb_8unorm(value->float32[c]); - } else if (channel->type == VK_FORMAT_TYPE_UNSIGNED) { - v = MAX2(MIN2(value->float32[c], 1.0f), 0.0f) * ((1ULL << channel->size) - 1); - } else { - v = MAX2(MIN2(value->float32[c], 1.0f), -1.0f) * ((1ULL << (channel->size - 1)) - 1); + } else { + float f = MIN2(value->float32[c], 1.0f); + + if (channel->type == VK_FORMAT_TYPE_UNSIGNED) { + f = MAX2(f, 0.0f) * ((1ULL << channel->size) - 1); + } else { + f = MAX2(f, -1.0f) * ((1ULL << (channel->size - 1)) - 1); + } + + /* The hardware rounds before conversion. 
*/ + if (f > 0) + f += 0.5f; + else + f -= 0.5f; + + v = (uint64_t)f; } } else if (channel->type == VK_FORMAT_TYPE_FLOAT) { if (channel->size == 32) { diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c index ef690edb471..f3a8f6464b8 100644 --- a/src/amd/vulkan/radv_meta_blit.c +++ b/src/amd/vulkan/radv_meta_blit.c @@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device, .subpass = 0, }; - switch(aspect) { - case VK_IMAGE_ASPECT_COLOR_BIT: - vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, - .attachmentCount = 1, - .pAttachments = (VkPipelineColorBlendAttachmentState []) { - { .colorWriteMask = - VK_COLOR_COMPONENT_A_BIT | - VK_COLOR_COMPONENT_R_BIT | - VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT }, + VkPipelineColorBlendStateCreateInfo color_blend_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { + .colorWriteMask = VK_COLOR_COMPONENT_A_BIT | + VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT }, } }; + + VkPipelineDepthStencilStateCreateInfo depth_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = true, + .depthWriteEnable = true, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + VkPipelineDepthStencilStateCreateInfo stencil_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .stencilTestEnable = true, + .front = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .back = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = 
VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + switch(aspect) { + case VK_IMAGE_ASPECT_COLOR_BIT: + vk_pipeline_info.pColorBlendState = &color_blend_info; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - .depthTestEnable = true, - .depthWriteEnable = true, - .depthCompareOp = VK_COMPARE_OP_ALWAYS, - }; + vk_pipeline_info.pDepthStencilState = &depth_info; break; case VK_IMAGE_ASPECT_STENCIL_BIT: - vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - .depthTestEnable = false, - .depthWriteEnable = false, - .stencilTestEnable = true, - .front = { - .failOp = VK_STENCIL_OP_REPLACE, - .passOp = VK_STENCIL_OP_REPLACE, - .depthFailOp = VK_STENCIL_OP_REPLACE, - .compareOp = VK_COMPARE_OP_ALWAYS, - .compareMask = 0xff, - .writeMask = 0xff, - .reference = 0 - }, - .back = { - .failOp = VK_STENCIL_OP_REPLACE, - .passOp = VK_STENCIL_OP_REPLACE, - .depthFailOp = VK_STENCIL_OP_REPLACE, - .compareOp = VK_COMPARE_OP_ALWAYS, - .compareMask = 0xff, - .writeMask = 0xff, - .reference = 0 - }, - .depthCompareOp = VK_COMPARE_OP_ALWAYS, - }; + vk_pipeline_info.pDepthStencilState = &stencil_info; break; default: unreachable("Unhandled aspect"); diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c index 8805f0435e1..32af736fd8f 100644 --- a/src/amd/vulkan/radv_meta_clear.c +++ b/src/amd/vulkan/radv_meta_clear.c @@ -370,14 +370,29 @@ emit_color_clear(struct radv_cmd_buffer *cmd_buffer, const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; const uint32_t subpass_att = clear_att->colorAttachment; const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment; - const 
struct radv_image_view *iview = fb->attachments[pass_att].attachment; - const uint32_t samples = iview->image->info.samples; - const uint32_t samples_log2 = ffs(samples) - 1; - unsigned fs_key = radv_format_meta_fs_key(iview->vk_format); + const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL; + uint32_t samples, samples_log2; + VkFormat format; + unsigned fs_key; VkClearColorValue clear_value = clear_att->clearValue.color; VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); VkPipeline pipeline; + /* When a framebuffer is bound to the current command buffer, get the + * number of samples from it. Otherwise, get the number of samples from + * the render pass because it's likely a secondary command buffer. + */ + if (iview) { + samples = iview->image->info.samples; + format = iview->vk_format; + } else { + samples = cmd_buffer->state.pass->attachments[pass_att].samples; + format = cmd_buffer->state.pass->attachments[pass_att].format; + } + + samples_log2 = ffs(samples) - 1; + fs_key = radv_format_meta_fs_key(format); + if (fs_key == -1) { radv_finishme("color clears incomplete"); return; @@ -617,6 +632,9 @@ static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearRect *clear_rect, VkClearDepthStencilValue clear_value) { + if (!iview) + return false; + uint32_t queue_mask = radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index, cmd_buffer->queue_family_index); @@ -633,7 +651,7 @@ static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer, iview->base_mip == 0 && iview->base_layer == 0 && radv_layout_is_htile_compressed(iview->image, layout, queue_mask) && - !radv_image_extent_compare(iview->image, &iview->extent)) + radv_image_extent_compare(iview->image, &iview->extent)) return true; return false; } @@ -705,11 +723,22 @@ emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer, const uint32_t pass_att = subpass->depth_stencil_attachment.attachment; 
VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil; VkImageAspectFlags aspects = clear_att->aspectMask; - const struct radv_image_view *iview = fb->attachments[pass_att].attachment; - const uint32_t samples = iview->image->info.samples; - const uint32_t samples_log2 = ffs(samples) - 1; + const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL; + uint32_t samples, samples_log2; VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer); + /* When a framebuffer is bound to the current command buffer, get the + * number of samples from it. Otherwise, get the number of samples from + * the render pass because it's likely a secondary command buffer. + */ + if (iview) { + samples = iview->image->info.samples; + } else { + samples = cmd_buffer->state.pass->attachments[pass_att].samples; + } + + samples_log2 = ffs(samples) - 1; + assert(pass_att != VK_ATTACHMENT_UNUSED); if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) @@ -915,7 +944,11 @@ static bool radv_image_view_can_fast_clear(struct radv_device *device, const struct radv_image_view *iview) { - struct radv_image *image = iview->image; + struct radv_image *image; + + if (!iview) + return false; + image = iview->image; /* Only fast clear if the image itself can be fast cleared. */ if (!radv_image_can_fast_clear(device, image)) @@ -1523,7 +1556,7 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer, const uint32_t subpass_att = clear_att->colorAttachment; const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment; VkImageLayout image_layout = subpass->color_attachments[subpass_att].layout; - const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + const struct radv_image_view *iview = fb ? 
fb->attachments[pass_att].attachment : NULL; VkClearColorValue clear_value = clear_att->clearValue.color; if (radv_can_fast_clear_color(cmd_buffer, iview, image_layout, @@ -1536,8 +1569,11 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer, } } else { const uint32_t pass_att = subpass->depth_stencil_attachment.attachment; + if (pass_att == VK_ATTACHMENT_UNUSED) + return; + VkImageLayout image_layout = subpass->depth_stencil_attachment.layout; - const struct radv_image_view *iview = fb->attachments[pass_att].attachment; + const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL; VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil; assert(aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index e80938527e5..00d65de8164 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -92,6 +92,7 @@ struct radv_shader_context { gl_shader_stage stage; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; + uint64_t float16_shaded_mask; uint64_t input_mask; uint64_t output_mask; @@ -1441,7 +1442,7 @@ store_tcs_output(struct ac_shader_abi *abi, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); const unsigned location = var->data.location; - const unsigned component = var->data.location_frac; + unsigned component = var->data.location_frac; const bool is_patch = var->data.patch; const bool is_compact = var->data.compact; LLVMValueRef dw_addr; @@ -1459,10 +1460,14 @@ store_tcs_output(struct ac_shader_abi *abi, } param = shader_io_get_unique_index(location); - if (location == VARYING_SLOT_CLIP_DIST0 && - is_compact && const_index > 3) { - const_index -= 3; - param++; + if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { + const_index += component; + component = 0; + + if (const_index >= 4) { + const_index -= 4; + param++; + } } if (!is_patch) { @@ -1529,9 +1534,13 @@ 
load_tes_input(struct ac_shader_abi *abi, LLVMValueRef result; unsigned param = shader_io_get_unique_index(location); - if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) { - const_index -= 3; - param++; + if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { + const_index += component; + component = 0; + if (const_index >= 4) { + const_index -= 4; + param++; + } } buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, @@ -2018,10 +2027,32 @@ handle_vs_input_decl(struct radv_shader_context *ctx, t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - input = ac_build_buffer_load_format(&ctx->ac, t_list, - buffer_index, - ctx->ac.i32_0, - num_channels, false, true); + if (ctx->options->key.vs.vertex_attribute_provided & (1u << attrib_index)) { + input = ac_build_buffer_load_format(&ctx->ac, t_list, + buffer_index, + ctx->ac.i32_0, + num_channels, false, true); + } else { + /* Per the Vulkan spec, it's invalid to consume vertex + * attributes that are not provided by the pipeline but + * some (invalid) apps appear to do that. Fill the + * input array with (eg. (0, 0, 0, 1)) to workaround + * the problem and to avoid possible GPU hangs. + */ + LLVMValueRef chan[4]; + + /* The input_usage mask might be 0 if input variables + * are not removed by the compiler. + */ + num_channels = CLAMP(num_channels, 1, 4); + + for (unsigned i = 0; i < num_channels; i++) { + chan[i] = i == 3 ? 
ctx->ac.f32_1 : ctx->ac.f32_0; + chan[i] = ac_to_float(&ctx->ac, chan[i]); + } + + input = ac_build_gather_values(&ctx->ac, chan, num_channels); + } input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels); @@ -2051,6 +2082,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, unsigned attr, LLVMValueRef interp_param, LLVMValueRef prim_mask, + bool float16, LLVMValueRef result[4]) { LLVMValueRef attr_number; @@ -2083,7 +2115,12 @@ static void interp_fs_input(struct radv_shader_context *ctx, for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - if (interp) { + if (interp && float16) { + result[chan] = ac_build_fs_interp_f16(&ctx->ac, + llvm_chan, + attr_number, + prim_mask, i, j); + } else if (interp) { result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, @@ -2095,7 +2132,30 @@ static void interp_fs_input(struct radv_shader_context *ctx, attr_number, prim_mask); result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); - result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), ""); + result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? 
ctx->ac.i16 : ctx->ac.i32, ""); + } + } +} + +static void mark_16bit_fs_input(struct radv_shader_context *ctx, + const struct glsl_type *type, + int location) +{ + if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { + unsigned attrib_count = glsl_count_attribute_slots(type, false); + if (glsl_type_is_16bit(type)) { + ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; + } + } else if (glsl_type_is_array(type)) { + unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); + for (unsigned i = 0; i < glsl_get_length(type); ++i) { + mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride); + } + } else { + assert(glsl_type_is_struct(type)); + for (unsigned i = 0; i < glsl_get_length(type); i++) { + mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location); + location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); } } } @@ -2110,9 +2170,20 @@ handle_fs_input_decl(struct radv_shader_context *ctx, uint64_t mask; variable->data.driver_location = idx * 4; + + + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + + glsl_get_length(variable->type); + attrib_count = (component_count + 3) / 4; + } else + mark_16bit_fs_input(ctx, variable->type, idx); + mask = ((1ull << attrib_count) - 1) << variable->data.location; - if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) { + if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT || + glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT16 || + glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_STRUCT) { unsigned interp_type; if (variable->data.sample) interp_type = INTERP_SAMPLE; @@ -2123,22 +2194,12 @@ handle_fs_input_decl(struct radv_shader_context *ctx, interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); } - bool is_16bit = 
glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32; if (interp == NULL) - interp = LLVMGetUndef(type); + interp = LLVMGetUndef(ctx->ac.i32); for (unsigned i = 0; i < attrib_count; ++i) ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; - if (idx == VARYING_SLOT_CLIP_DIST0) { - /* Do not account for the number of components inside the array - * of clip/cull distances because this might wrongly set other - * bits like primitive ID or layer. - */ - mask = 1ull << VARYING_SLOT_CLIP_DIST0; - } - ctx->input_mask |= mask; } @@ -2200,11 +2261,14 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { interp_param = *inputs; - interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, + bool float16 = (ctx->float16_shaded_mask >> i) & 1; + interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16, inputs); if (LLVMIsUndef(interp_param)) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + if (float16) + ctx->shader_info->fs.float16_shaded_mask |= 1u << index; if (i >= VARYING_SLOT_VAR0) ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; ++index; @@ -2216,7 +2280,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, interp_param = *inputs; interp_fs_input(ctx, index, interp_param, - ctx->abi.prim_mask, inputs); + ctx->abi.prim_mask, false, inputs); ++index; } } else if (i == VARYING_SLOT_POS) { @@ -2250,6 +2314,12 @@ scan_shader_output_decl(struct radv_shader_context *ctx, if (stage == MESA_SHADER_TESS_CTRL) return; + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + + glsl_get_length(variable->type); + attrib_count = (component_count + 3) / 4; + } + mask_attribs = ((1ull << attrib_count) - 1) << idx; if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || @@ -2265,8 +2335,6 @@ scan_shader_output_decl(struct 
radv_shader_context *ctx, ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; } - - mask_attribs = 1ull << idx; } } @@ -2365,7 +2433,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildZExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2376,7 +2444,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildSExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2429,12 +2497,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, } else memcpy(&args->out[0], values, sizeof(values[0]) * 4); - for (unsigned i = 0; i < 4; ++i) { - if (!(args->enabled_channels & (1 << i))) - continue; - + for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); - } } static void @@ -2615,51 +2679,41 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset)); - if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) { - unsigned output_usage_mask, length; - LLVMValueRef slots[8]; - unsigned j; - - if (ctx->stage == MESA_SHADER_VERTEX && - !ctx->is_gs_copy_shader) { - output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[VARYING_SLOT_CLIP_DIST0]; - } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[VARYING_SLOT_CLIP_DIST0]; - } else { - assert(ctx->is_gs_copy_shader); - output_usage_mask = - ctx->shader_info->info.gs.output_usage_mask[VARYING_SLOT_CLIP_DIST0]; - } + for(unsigned location = VARYING_SLOT_CLIP_DIST0; location <= VARYING_SLOT_CLIP_DIST1; 
++location) { + if (ctx->output_mask & (1ull << location)) { + unsigned output_usage_mask, length; + LLVMValueRef slots[4]; + unsigned j; + + if (ctx->stage == MESA_SHADER_VERTEX && + !ctx->is_gs_copy_shader) { + output_usage_mask = + ctx->shader_info->info.vs.output_usage_mask[location]; + } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { + output_usage_mask = + ctx->shader_info->info.tes.output_usage_mask[location]; + } else { + assert(ctx->is_gs_copy_shader); + output_usage_mask = + ctx->shader_info->info.gs.output_usage_mask[location]; + } - length = util_last_bit(output_usage_mask); + length = util_last_bit(output_usage_mask); - i = VARYING_SLOT_CLIP_DIST0; - for (j = 0; j < length; j++) - slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); + for (j = 0; j < length; j++) + slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, location, j)); - for (i = length; i < 8; i++) - slots[i] = LLVMGetUndef(ctx->ac.f32); + for (i = length; i < 4; i++) + slots[i] = LLVMGetUndef(ctx->ac.f32); - if (length > 4) { - target = V_008DFC_SQ_EXP_POS + 3; - si_llvm_init_export_args(ctx, &slots[4], 0xf, target, &args); + target = V_008DFC_SQ_EXP_POS + 2 + (location - VARYING_SLOT_CLIP_DIST0); + si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args); memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - } + &args, sizeof(args)); - target = V_008DFC_SQ_EXP_POS + 2; - si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args); - memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - - /* Export the clip/cull distances values to the next stage. */ - radv_export_param(ctx, param_count, &slots[0], 0xf); - outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0] = param_count++; - if (length > 4) { - radv_export_param(ctx, param_count, &slots[4], 0xf); - outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1] = param_count++; + /* Export the clip/cull distances values to the next stage. 
*/ + radv_export_param(ctx, param_count, &slots[0], 0xf); + outinfo->vs_output_param_offset[location] = param_count++; } } @@ -2820,28 +2874,14 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMValueRef lds_base = NULL; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask; int param_index; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; - if (ctx->stage == MESA_SHADER_VERTEX) { - output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; - } else { - assert(ctx->stage == MESA_SHADER_TESS_EVAL); - output_usage_mask = - ctx->shader_info->info.tes.output_usage_mask[i]; - } - - if (i == VARYING_SLOT_CLIP_DIST0) - length = util_last_bit(output_usage_mask); - param_index = shader_io_get_unique_index(i); - max_output_written = MAX2(param_index + (length > 4), max_output_written); + max_output_written = MAX2(param_index, max_output_written); } outinfo->esgs_itemsize = (max_output_written + 1) * 16; @@ -2862,7 +2902,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; unsigned output_usage_mask; int param_index; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; @@ -2876,9 +2915,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ctx->shader_info->info.tes.output_usage_mask[i]; } - if (i == VARYING_SLOT_CLIP_DIST0) - length = util_last_bit(output_usage_mask); - param_index = shader_io_get_unique_index(i); if (lds_base) { @@ -2887,7 +2923,7 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ""); } - for (j = 0; j < length; j++) { + for (j = 0; j < 4; j++) { if (!(output_usage_mask & (1 << j))) continue; @@ -2924,22 +2960,16 @@ handle_ls_outputs_post(struct radv_shader_context *ctx) vertex_dw_stride, ""); for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = - ctx->shader_info->info.vs.output_usage_mask[i]; LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; - int length = 4; if 
(!(ctx->output_mask & (1ull << i))) continue; - if (i == VARYING_SLOT_CLIP_DIST0) - length = util_last_bit(output_usage_mask); - int param = shader_io_get_unique_index(i); LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, false), ""); - for (unsigned j = 0; j < length; j++) { + for (unsigned j = 0; j < 4; j++) { LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); value = ac_to_integer(&ctx->ac, value); value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); @@ -3467,10 +3497,17 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.clamp_shadow_reference = false; ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800; + /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8, + * we fallback to the old intrinsics for atomic buffer image operations + * and thus we need to apply the indexing workaround... + */ + ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900; + if (shader_count >= 2) ac_init_exec_full_mask(&ctx.ac); - if (ctx.ac.chip_class == GFX9 && + if ((ctx.ac.family == CHIP_VEGA10 || + ctx.ac.family == CHIP_RAVEN) && shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 138e153f9a4..2526000f56f 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -524,6 +524,14 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline, col_format |= cf << (4 * i); } + if (!col_format && blend->need_src_alpha & (1 << 0)) { + /* When a subpass doesn't have any color attachments, write the + * alpha channel of MRT0 when alpha coverage is enabled because + * the depth attachment needs it. 
+ */ + col_format |= V_028714_SPI_SHADER_32_ABGR; + } + /* If the i-th target format is set, all previous target formats must * be non-zero to avoid hangs. */ @@ -689,6 +697,7 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline, if (vkms && vkms->alphaToCoverageEnable) { blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); + blend.need_src_alpha |= 0x1; } blend.cb_target_mask = 0; @@ -1436,11 +1445,13 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline, const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info = vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT); - if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) { + if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) { dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount; - typed_memcpy(dynamic->discard_rectangle.rectangles, - discard_rectangle_info->pDiscardRectangles, - discard_rectangle_info->discardRectangleCount); + if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) { + typed_memcpy(dynamic->discard_rectangle.rectangles, + discard_rectangle_info->pDiscardRectangles, + discard_rectangle_info->discardRectangleCount); + } } pipeline->dynamic_state.mask = states; @@ -1913,6 +1924,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline, } key.vertex_alpha_adjust |= adjust << (2 * location); } + + key.vertex_attribute_provided |= 1 << location; } if (pCreateInfo->pTessellationState) @@ -1941,6 +1954,7 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys, { keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs; keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust; + keys[MESA_SHADER_VERTEX].vs.vertex_attribute_provided = key->vertex_attribute_provided; for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i) keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i]; @@ -3079,13 +3093,17 @@ 
radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader); } -static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade) +static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, bool float16) { uint32_t ps_input_cntl; if (offset <= AC_EXP_PARAM_OFFSET_31) { ps_input_cntl = S_028644_OFFSET(offset); if (flat_shade) ps_input_cntl |= S_028644_FLAT_SHADE(1); + if (float16) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } } else { /* The input is a DEFAULT_VAL constant. */ assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && @@ -3110,7 +3128,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, if (ps->info.info.ps.prim_id_input) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); ++ps_offset; } } @@ -3120,9 +3138,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, ps->info.info.needs_multiview_view_index) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); else - ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false); ++ps_offset; } @@ -3138,14 +3156,14 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); 
++ps_offset; } vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1]; if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.info.ps.num_input_clips_culls > 4) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); ++ps_offset; } } @@ -3153,6 +3171,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) { unsigned vs_offset; bool flat_shade; + bool float16; if (!(ps->info.fs.input_mask & (1u << i))) continue; @@ -3164,8 +3183,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, } flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset)); + float16 = !!(ps->info.fs.float16_shaded_mask & (1u << ps_offset)); - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, float16); ++ps_offset; } @@ -3192,11 +3212,11 @@ radv_compute_db_shader_control(const struct radv_device *device, bool disable_rbplus = device->physical_device->has_rbplus && !device->physical_device->rbplus_allowed; - /* Do not enable the gl_SampleMask fragment shader output if MSAA is - * disabled. + /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled + * but this appears to break Project Cars (DXVK). 
See + * https://bugs.freedesktop.org/show_bug.cgi?id=109401 */ - bool mask_export_enable = ms->num_samples > 1 && - ps->info.info.ps.writes_sample_mask; + bool mask_export_enable = ps->info.info.ps.writes_sample_mask; return S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) | S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) | diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 85c18906f84..ea957ae6dab 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -306,6 +306,9 @@ struct radv_physical_device { /* Whether DCC should be enabled for MSAA textures. */ bool dcc_msaa_allowed; + /* Whether LOAD_CONTEXT_REG packets are supported. */ + bool has_load_ctx_reg_pkt; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ @@ -362,6 +365,7 @@ struct radv_pipeline_cache { struct radv_pipeline_key { uint32_t instance_rate_inputs; uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; + uint32_t vertex_attribute_provided; uint64_t vertex_alpha_adjust; unsigned tess_input_vertices; uint32_t col_format; @@ -1144,6 +1148,7 @@ void si_write_scissors(struct radeon_cmdbuf *cs, int first, const VkViewport *viewports, bool can_use_guardband); uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw, + bool count_from_stream_output, uint32_t draw_vertex_count); void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs, enum chip_class chip_class, @@ -1462,6 +1467,7 @@ bool radv_format_pack_clear_color(VkFormat format, bool radv_is_colorbuffer_format_supported(VkFormat format, bool *blendable); bool radv_dcc_formats_compatible(VkFormat format1, VkFormat format2); +bool radv_device_supports_etc(struct radv_physical_device *physical_device); struct radv_fmask_info { uint64_t offset; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 32cd9ae25e9..ec571e2f8c5 100644 --- 
a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, NIR_PASS(progress, shader, nir_opt_if); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true); NIR_PASS(progress, shader, nir_opt_algebraic); NIR_PASS(progress, shader, nir_opt_constant_folding); NIR_PASS(progress, shader, nir_opt_undef); @@ -222,6 +222,8 @@ radv_shader_compile_to_nir(struct radv_device *device, .lower_ubo_ssbo_access_to_offsets = true, .caps = { .descriptor_array_dynamic_indexing = true, + .descriptor_array_non_uniform_indexing = true, + .descriptor_indexing = true, .device_group = true, .draw_parameters = true, .float64 = true, @@ -610,6 +612,8 @@ shader_variant_create(struct radv_device *device, tm_options |= AC_TM_SISCHED; if (options->check_ir) tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); radv_init_llvm_once(); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 3652a811e80..f6f9dd2bbf1 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -66,6 +66,9 @@ struct radv_vs_variant_key { uint32_t instance_rate_inputs; uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; + /* Mask of vertex attributes that are provided by the pipeline. */ + uint32_t vertex_attribute_provided; + /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. * so we may need to fix it up. 
*/ uint64_t alpha_adjust; @@ -257,6 +260,7 @@ struct radv_shader_variant_info { unsigned num_interp; uint32_t input_mask; uint32_t flat_shaded_mask; + uint32_t float16_shaded_mask; bool can_discard; bool early_fragment_test; } fs; @@ -401,6 +405,8 @@ static inline unsigned shader_io_get_unique_index(gl_varying_slot slot) return 1; if (slot == VARYING_SLOT_CLIP_DIST0) return 2; + if (slot == VARYING_SLOT_CLIP_DIST1) + return 3; /* 3 is reserved for clip dist as well */ if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31) return 4 + (slot - VARYING_SLOT_VAR0); diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 7e5a3789af2..fdc4f52086b 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -101,7 +101,7 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, case MESA_SHADER_VERTEX: { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - if (var->data.mode == nir_var_shader_in) { + if (var && var->data.mode == nir_var_shader_in) { unsigned idx = var->data.location; uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa); @@ -115,6 +115,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, } } +static uint32_t +widen_writemask(uint32_t wrmask) +{ + uint32_t new_wrmask = 0; + for(unsigned i = 0; i < 4; i++) + new_wrmask |= (wrmask & (1 << i) ? 
0x3 : 0x0) << (i * 2); + return new_wrmask; +} + static void set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, uint8_t *output_usage_mask) @@ -122,25 +131,27 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr, nir_deref_instr *deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref_instr); - unsigned attrib_count = glsl_count_attribute_slots(var->type, false); + unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, false); unsigned idx = var->data.location; unsigned comp = var->data.location_frac; unsigned const_offset = 0; get_deref_offset(deref_instr, &const_offset); - if (idx == VARYING_SLOT_CLIP_DIST0) { - /* Special case for clip/cull distances because there are - * combined into a single array that contains both. - */ - output_usage_mask[idx] |= 1 << const_offset; + if (var->data.compact) { + assert(!glsl_type_is_64bit(deref_instr->type)); + const_offset += comp; + output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset % 4); return; } - for (unsigned i = 0; i < attrib_count; i++) { + uint32_t wrmask = nir_intrinsic_write_mask(instr); + if (glsl_type_is_64bit(deref_instr->type)) + wrmask = widen_writemask(wrmask); + + for (unsigned i = 0; i < attrib_count; i++) output_usage_mask[idx + i + const_offset] |= - instr->const_index[0] << comp; - } + ((wrmask >> (i * 4)) & 0xf) << comp; } static void @@ -150,7 +161,7 @@ gather_intrinsic_store_deref_info(const nir_shader *nir, { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - if (var->data.mode == nir_var_shader_out) { + if (var && var->data.mode == nir_var_shader_out) { unsigned idx = var->data.location; switch (nir->info.stage) { @@ -174,13 +185,9 @@ gather_intrinsic_store_deref_info(const nir_shader *nir, type = glsl_get_array_element(var->type); unsigned slots = - var->data.compact ? 
DIV_ROUND_UP(glsl_get_length(type), 4) + var->data.compact ? DIV_ROUND_UP(var->data.location_frac + glsl_get_length(type), 4) : glsl_count_attribute_slots(type, false); - if (idx == VARYING_SLOT_CLIP_DIST0) - slots = (nir->info.clip_distance_array_size + - nir->info.cull_distance_array_size > 4) ? 2 : 1; - mark_tess_output(info, var->data.patch, param, slots); break; } @@ -374,7 +381,8 @@ gather_info_input_decl_ps(const nir_shader *nir, const nir_variable *var, info->ps.layer_input = true; break; case VARYING_SLOT_CLIP_DIST0: - info->ps.num_input_clips_culls = attrib_count; + case VARYING_SLOT_CLIP_DIST1: + info->ps.num_input_clips_culls += attrib_count; break; default: break; @@ -409,8 +417,8 @@ gather_info_output_decl_ls(const nir_shader *nir, const nir_variable *var, int idx = var->data.location; unsigned param = shader_io_get_unique_index(idx); int num_slots = glsl_count_attribute_slots(var->type, false); - if (idx == VARYING_SLOT_CLIP_DIST0) - num_slots = (nir->info.clip_distance_array_size + nir->info.cull_distance_array_size > 4) ? 2 : 1; + if (var->data.compact) + num_slots = DIV_ROUND_UP(var->data.location_frac + glsl_get_length(var->type), 4); mark_ls_output(info, param, num_slots); } diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index e75c6d127d6..e73c13762e5 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -561,6 +561,7 @@ radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num) uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw, + bool count_from_stream_output, uint32_t draw_vertex_count) { enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; @@ -622,6 +623,12 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, (instanced_draw || indirect_draw)) partial_vs_wave = true; + /* Hardware requirement when drawing primitives from a stream + * output buffer. 
+ */ + if (count_from_stream_output) + wd_switch_on_eop = true; + /* If the WD switch is false, the IA switch must be false too. */ assert(wd_switch_on_eop || !ia_switch_on_eop); } diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index d3b1e2cd4c6..49a86a72c31 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -543,7 +543,7 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, cs->handles[cs->num_buffers].bo_handle = bo; cs->handles[cs->num_buffers].bo_priority = priority; - hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1); + hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1); cs->buffer_hash_table[hash] = cs->num_buffers; ++cs->num_buffers; @@ -665,6 +665,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, assert(num < ws->num_buffers); handles[num].bo_handle = bo->bo_handle; handles[num].bo_priority = bo->priority; + num++; } r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h index 854e216551f..709669b2a57 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h @@ -29,6 +29,13 @@ #ifndef RADV_AMDGPU_WINSYS_PUBLIC_H #define RADV_AMDGPU_WINSYS_PUBLIC_H +/* The number of IBs per submit isn't infinite, it depends on the ring type + * (ie. some initial setup needed for a submit) and the number of IBs (4 DW). + * This limit is arbitrary but should be safe for now. Ideally, we should get + * this limit from the KMD. 
+*/ +#define RADV_MAX_IBS_PER_SUBMIT 192 + struct radeon_winsys *radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags); diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml index 754461dc067..06e8ddad7ec 100644 --- a/src/broadcom/cle/v3d_packet_v33.xml +++ b/src/broadcom/cle/v3d_packet_v33.xml @@ -820,8 +820,8 @@ - - + + diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h index e10b4586609..cb1ee7c96f4 100644 --- a/src/broadcom/common/v3d_cpu_tiling.h +++ b/src/broadcom/common/v3d_cpu_tiling.h @@ -159,9 +159,8 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride, * d0-d7. */ "vstm %[gpu], {q0, q1, q2, q3}\n" - : + : [cpu] "+r"(cpu) : [gpu] "r"(gpu), - [cpu] "r"(cpu), [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); return; diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index ee7a3e6bc00..e21ee246eff 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -32,7 +32,8 @@ */ #define V3D_MAX_TEXTURE_SAMPLERS 16 -#define V3D_MAX_MIP_LEVELS 12 +/* The HW can do 16384 (15), but we run into hangs when we expose that. 
*/ +#define V3D_MAX_MIP_LEVELS 13 #define V3D_MAX_SAMPLES 4 diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index a5e75f650e8..bd19bb9b0b6 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -121,7 +121,7 @@ vir_emit_thrsw(struct v3d_compile *c) */ c->last_thrsw = vir_NOP(c); c->last_thrsw->qpu.sig.thrsw = true; - c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); + c->last_thrsw_at_top_level = !c->in_control_flow; } static uint32_t @@ -1158,7 +1158,9 @@ emit_frag_end(struct v3d_compile *c) inst->src[vir_get_implicit_uniform_src(inst)] = vir_uniform_ui(c, tlb_specifier | 0xffffff00); + c->writes_z = true; } else if (c->s->info.fs.uses_discard || + !c->s->info.fs.early_fragment_tests || c->fs_key->sample_alpha_to_coverage || !has_any_tlb_color_write) { /* Emit passthrough Z if it needed to be delayed until shader @@ -1188,6 +1190,7 @@ emit_frag_end(struct v3d_compile *c) inst->src[vir_get_implicit_uniform_src(inst)] = vir_uniform_ui(c, tlb_specifier | 0xffffff00); + c->writes_z = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB @@ -1455,7 +1458,7 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); @@ -2103,10 +2106,10 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) else else_block = vir_new_block(c); - bool was_top_level = false; + bool was_uniform_control_flow = false; if (c->execute.file == QFILE_NULL) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); - was_top_level = true; + was_uniform_control_flow = true; } /* Set up the flags for the IF condition (taking the THEN branch). 
*/ @@ -2122,7 +2125,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and * was previously active (execute Z) for updating the exec flags. */ - if (was_top_level) { + if (was_uniform_control_flow) { cond = v3d_qpu_cond_invert(cond); } else { struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), @@ -2176,7 +2179,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_link_blocks(c->cur_block, after_block); vir_set_emit_block(c, after_block); - if (was_top_level) + if (was_uniform_control_flow) c->execute = c->undef; else ntq_activate_execute_for_block(c); @@ -2185,12 +2188,15 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) static void ntq_emit_if(struct v3d_compile *c, nir_if *nif) { + bool was_in_control_flow = c->in_control_flow; + c->in_control_flow = true; if (c->execute.file == QFILE_NULL && nir_src_is_dynamically_uniform(nif->condition)) { ntq_emit_uniform_if(c, nif); } else { ntq_emit_nonuniform_if(c, nif); } + c->in_control_flow = was_in_control_flow; } static void @@ -2267,10 +2273,13 @@ static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { - bool was_top_level = false; + bool was_in_control_flow = c->in_control_flow; + c->in_control_flow = true; + + bool was_uniform_control_flow = false; if (c->execute.file == QFILE_NULL) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); - was_top_level = true; + was_uniform_control_flow = true; } struct qblock *save_loop_cont_block = c->loop_cont_block; @@ -2307,7 +2316,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) vir_link_blocks(c->cur_block, c->loop_break_block); vir_set_emit_block(c, c->loop_break_block); - if (was_top_level) + if (was_uniform_control_flow) c->execute = c->undef; else ntq_activate_execute_for_block(c); @@ -2316,6 +2325,8 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) 
c->loop_cont_block = save_loop_cont_block; c->loops++; + + c->in_control_flow = was_in_control_flow; } static void diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 127b04136d1..671aba3c551 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -519,6 +519,7 @@ struct v3d_compile { uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; bool uses_center_w; + bool writes_z; struct v3d_ubo_range *ubo_ranges; bool *ubo_range_used; @@ -531,6 +532,7 @@ struct v3d_compile { * yes, otherwise a block number + 1 that the channel jumped to. */ struct qreg execute; + bool in_control_flow; struct qreg line_x, point_x, point_y; @@ -716,7 +718,7 @@ struct v3d_fs_prog_data { uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; bool writes_z; - bool discard; + bool disable_ez; bool uses_center_w; }; diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c index e74206b3949..2aa3cbad495 100644 --- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c @@ -156,7 +156,7 @@ pack_sint(nir_builder *b, nir_ssa_def *color, const unsigned *bits, int num_components) { color = nir_channels(b, color, (1 << num_components) - 1); - color = nir_format_clamp_uint(b, color, bits); + color = nir_format_clamp_sint(b, color, bits); return pack_bits(b, color, bits, num_components, true); } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 10105fbd861..20f7004149c 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -777,21 +777,9 @@ v3d_fs_set_prog_data(struct v3d_compile *c, struct v3d_fs_prog_data *prog_data) { v3d_set_fs_prog_data_inputs(c, prog_data); - prog_data->writes_z = (c->s->info.outputs_written & - (1 << FRAG_RESULT_DEPTH)); - prog_data->discard = (c->s->info.fs.uses_discard || - 
c->fs_key->sample_alpha_to_coverage); + prog_data->writes_z = c->writes_z; + prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; - - /* If the shader has some side effects and hasn't allowed early - * fragment tests, disable them. - */ - if (!c->s->info.fs.early_fragment_tests && - (c->s->info.num_images || - c->s->info.num_ssbos || - c->s->info.num_abos)) { - prog_data->discard = true; - } } static void @@ -888,6 +876,15 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) { if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); + + /* If the shader has no non-TLB side effects, we can promote it to + * enabling early_fragment_tests even if the user didn't. + */ + if (!(c->s->info.num_images || + c->s->info.num_ssbos || + c->s->info.num_abos)) { + c->s->info.fs.early_fragment_tests = true; + } } static void diff --git a/src/compiler/Android.glsl.gen.mk b/src/compiler/Android.glsl.gen.mk index e31eb6f101f..3b94ea7bd2f 100644 --- a/src/compiler/Android.glsl.gen.mk +++ b/src/compiler/Android.glsl.gen.mk @@ -104,6 +104,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< strings > $@ -$(intermediates)/compiler/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py +$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@ diff --git a/src/compiler/Android.glsl.mk b/src/compiler/Android.glsl.mk index 0aabafa2673..37b3cb80251 100644 --- a/src/compiler/Android.glsl.mk +++ b/src/compiler/Android.glsl.mk @@ -48,7 +48,7 @@ LOCAL_STATIC_LIBRARIES := \ libmesa_nir LOCAL_MODULE := libmesa_glsl - +LOCAL_CFLAGS += -Wno-error include $(LOCAL_PATH)/Android.glsl.gen.mk include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk index 
75a247a245d..59da5dbdc1c 100644 --- a/src/compiler/Android.nir.mk +++ b/src/compiler/Android.nir.mk @@ -41,6 +41,9 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary +LOCAL_CFLAGS := \ + -Wno-missing-braces + LOCAL_STATIC_LIBRARIES := libmesa_compiler LOCAL_MODULE := libmesa_nir diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 489022a22a1..0b40c3c6ebe 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -229,6 +229,7 @@ NIR_FILES = \ nir/nir_lower_alpha_test.c \ nir/nir_lower_alu.c \ nir/nir_lower_alu_to_scalar.c \ + nir/nir_lower_array_deref_of_vec.c \ nir/nir_lower_atomics_to_ssbo.c \ nir/nir_lower_bitmap.c \ nir/nir_lower_bit_size.c \ @@ -251,6 +252,7 @@ NIR_FILES = \ nir/nir_lower_io_arrays_to_elements.c \ nir/nir_lower_io_to_temporaries.c \ nir/nir_lower_io_to_scalar.c \ + nir/nir_lower_io_to_vector.c \ nir/nir_lower_packing.c \ nir/nir_lower_passthrough_edgeflags.c \ nir/nir_lower_patch_vertices.c \ diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 620153e6a34..8c707265e44 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -3698,6 +3698,10 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual, "cannot be applied to a matrix, a structure, " "a block, or an array containing any of " "these."); + } else if (components > 4 && type->is_64bit()) { + _mesa_glsl_error(loc, state, "component layout qualifier " + "cannot be applied to dvec%u.", + components / 2); } else if (qual_component != 0 && (qual_component + components - 1) > 3) { _mesa_glsl_error(loc, state, "component overflow (%u > 3)", @@ -3940,7 +3944,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, "`invariant' after being used", var->name); } else { - var->data.invariant = 1; + var->data.explicit_invariant = true; + var->data.invariant = true; } } @@ -4148,8 +4153,10 @@ 
apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, } } - if (state->all_invariant && var->data.mode == ir_var_shader_out) + if (state->all_invariant && var->data.mode == ir_var_shader_out) { + var->data.explicit_invariant = true; var->data.invariant = true; + } var->data.interpolation = interpret_interpolation_qualifier(qual, var->type, @@ -4857,6 +4864,7 @@ ast_declarator_list::hir(exec_list *instructions, "`invariant' after being used", earlier->name); } else { + earlier->data.explicit_invariant = true; earlier->data.invariant = true; } } diff --git a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c index 719968a6671..87718112db7 100644 --- a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c +++ b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c @@ -147,10 +147,20 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state, remove_struct_derefs_prep(path.path, &name, &location, &type); - assert(location < state->shader_program->data->NumUniformStorage && - state->shader_program->data->UniformStorage[location].opaque[stage].active); + if (state->shader_program && var->data.how_declared != nir_var_hidden) { + /* For GLSL programs, look up the bindings in the uniform storage. */ + assert(location < state->shader_program->data->NumUniformStorage && + state->shader_program->data->UniformStorage[location].opaque[stage].active); - binding = state->shader_program->data->UniformStorage[location].opaque[stage].index; + binding = state->shader_program->data->UniformStorage[location].opaque[stage].index; + } else { + /* For ARB programs, built-in shaders, or internally generated sampler + * variables in GLSL programs, assume that whoever created the shader + * set the bindings correctly already. + */ + assert(var->data.explicit_binding); + binding = var->data.binding; + } if (var->type == type) { /* Fast path: We did not encounter any struct derefs. 
*/ @@ -167,6 +177,14 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state, } else { var = nir_variable_create(state->shader, nir_var_uniform, type, name); var->data.binding = binding; + + /* Don't set var->data.location. The old structure location could be + * used to index into gl_uniform_storage, assuming the full structure + * was walked in order. With the new split variables, this invariant + * no longer holds and there's no meaningful way to start from a base + * location and access a particular array element. Just leave it 0. + */ + _mesa_hash_table_insert_pre_hashed(state->remap_table, hash, name, var); } diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y index 1c095cb66f9..c951d9526ac 100644 --- a/src/compiler/glsl/glcpp/glcpp-parse.y +++ b/src/compiler/glsl/glcpp/glcpp-parse.y @@ -224,10 +224,12 @@ expanded_line: glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro); _glcpp_parser_skip_stack_change_if (parser, & @1, "elif", $2.value); } -| LINE_EXPANDED integer_constant NEWLINE { +| LINE_EXPANDED expression NEWLINE { + if (parser->is_gles && $2.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro); parser->has_new_line_number = 1; - parser->new_line_number = $2; - _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2); + parser->new_line_number = $2.value; + _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2.value); } | LINE_EXPANDED integer_constant integer_constant NEWLINE { parser->has_new_line_number = 1; @@ -238,6 +240,17 @@ expanded_line: "#line %" PRIiMAX " %" PRIiMAX "\n", $2, $3); } +| LINE_EXPANDED '(' expression ')' '(' expression ')' NEWLINE { + if (parser->is_gles && $3.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $3.undefined_macro); + if (parser->is_gles && $6.undefined_macro) + 
glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $6.undefined_macro); + parser->has_new_line_number = 1; + parser->new_line_number = $3.value; + parser->has_new_source_number = 1; + parser->new_source_number = $6.value; + _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX " %" PRIiMAX "\n", $3.value, $6.value); + } ; define: diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp index d2db0f95aca..47fc2fea160 100644 --- a/src/compiler/glsl/glsl_to_nir.cpp +++ b/src/compiler/glsl/glsl_to_nir.cpp @@ -353,6 +353,12 @@ nir_visitor::visit(ir_variable *ir) ir->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)) { var->data.compact = ir->type->without_array()->is_scalar(); } + + if (shader->info.stage > MESA_SHADER_VERTEX && + ir->data.location >= VARYING_SLOT_CLIP_DIST0 && + ir->data.location <= VARYING_SLOT_CULL_DIST1) { + var->data.compact = ir->type->without_array()->is_scalar(); + } } break; @@ -363,6 +369,12 @@ nir_visitor::visit(ir_variable *ir) ir->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)) { var->data.compact = ir->type->without_array()->is_scalar(); } + + if (shader->info.stage <= MESA_SHADER_GEOMETRY && + ir->data.location >= VARYING_SLOT_CLIP_DIST0 && + ir->data.location <= VARYING_SLOT_CULL_DIST1) { + var->data.compact = ir->type->without_array()->is_scalar(); + } break; case ir_var_uniform: diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp index 1d1a56ae9a5..f5aa1be4e20 100644 --- a/src/compiler/glsl/ir.cpp +++ b/src/compiler/glsl/ir.cpp @@ -1734,6 +1734,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name, this->data.centroid = false; this->data.sample = false; this->data.patch = false; + this->data.explicit_invariant = false; this->data.invariant = false; this->data.how_declared = ir_var_declared_normally; this->data.mode = mode; diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index d05d1998a50..8b32ed8209a 100644 --- 
a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -233,7 +233,7 @@ class ir_rvalue : public ir_instruction { ir_rvalue *as_rvalue_to_saturate(); - virtual bool is_lvalue(const struct _mesa_glsl_parse_state *state = NULL) const + virtual bool is_lvalue(const struct _mesa_glsl_parse_state * = NULL) const { return false; } @@ -657,6 +657,19 @@ class ir_variable : public ir_instruction { unsigned centroid:1; unsigned sample:1; unsigned patch:1; + /** + * Was an 'invariant' qualifier explicitly set in the shader? + * + * This is used to cross validate qualifiers. + */ + unsigned explicit_invariant:1; + /** + * Is the variable invariant? + * + * It can happen either by having the 'invariant' qualifier + * explicitly set in the shader or by being used in calculations + * of other invariant variables. + */ unsigned invariant:1; unsigned precise:1; diff --git a/src/compiler/glsl/ir_print_visitor.cpp b/src/compiler/glsl/ir_print_visitor.cpp index ef6bca1229e..b055d25d60d 100644 --- a/src/compiler/glsl/ir_print_visitor.cpp +++ b/src/compiler/glsl/ir_print_visitor.cpp @@ -199,6 +199,7 @@ void ir_print_visitor::visit(ir_variable *ir) const char *const samp = (ir->data.sample) ? "sample " : ""; const char *const patc = (ir->data.patch) ? "patch " : ""; const char *const inv = (ir->data.invariant) ? "invariant " : ""; + const char *const explicit_inv = (ir->data.explicit_invariant) ? "explicit_invariant " : ""; const char *const prec = (ir->data.precise) ? "precise " : ""; const char *const bindless = (ir->data.bindless) ? "bindless " : ""; const char *const bound = (ir->data.bound) ? 
"bound " : ""; @@ -215,11 +216,11 @@ void ir_print_visitor::visit(ir_variable *ir) const char *const interp[] = { "", "smooth", "flat", "noperspective" }; STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_MODE_COUNT); - fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ", + fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ", binding, loc, component, cent, bindless, bound, image_format, memory_read_only, memory_write_only, memory_coherent, memory_volatile, memory_restrict, - samp, patc, inv, prec, mode[ir->data.mode], + samp, patc, inv, explicit_inv, prec, mode[ir->data.mode], stream, interp[ir->data.interpolation]); diff --git a/src/compiler/glsl/ir_reader.cpp b/src/compiler/glsl/ir_reader.cpp index b87933ba511..d4f0e58b155 100644 --- a/src/compiler/glsl/ir_reader.cpp +++ b/src/compiler/glsl/ir_reader.cpp @@ -419,8 +419,10 @@ ir_reader::read_declaration(s_expression *expr) var->data.sample = 1; } else if (strcmp(qualifier->value(), "patch") == 0) { var->data.patch = 1; + } else if (strcmp(qualifier->value(), "explicit_invariant") == 0) { + var->data.explicit_invariant = true; } else if (strcmp(qualifier->value(), "invariant") == 0) { - var->data.invariant = 1; + var->data.invariant = true; } else if (strcmp(qualifier->value(), "uniform") == 0) { var->data.mode = ir_var_uniform; } else if (strcmp(qualifier->value(), "shader_storage") == 0) { diff --git a/src/compiler/glsl/link_uniform_block_active_visitor.cpp b/src/compiler/glsl/link_uniform_block_active_visitor.cpp index 368981852c0..0af3b312071 100644 --- a/src/compiler/glsl/link_uniform_block_active_visitor.cpp +++ b/src/compiler/glsl/link_uniform_block_active_visitor.cpp @@ -103,6 +103,7 @@ process_arrays(void *mem_ctx, ir_dereference_array *ir, if (*ub_array_ptr == NULL) { *ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements); (*ub_array_ptr)->ir = ir; + (*ub_array_ptr)->original_dim_size = block->type->length; } struct uniform_block_array_elements *ub_array = *ub_array_ptr; diff --git 
a/src/compiler/glsl/link_uniform_block_active_visitor.h b/src/compiler/glsl/link_uniform_block_active_visitor.h index fbac65d5b67..a8ea3f52b6d 100644 --- a/src/compiler/glsl/link_uniform_block_active_visitor.h +++ b/src/compiler/glsl/link_uniform_block_active_visitor.h @@ -32,6 +32,7 @@ struct uniform_block_array_elements { unsigned num_array_elements; ir_dereference_array *ir; + unsigned original_dim_size; struct uniform_block_array_elements *array; }; diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp index 0b890586298..1665fc3f8cb 100644 --- a/src/compiler/glsl/link_uniform_blocks.cpp +++ b/src/compiler/glsl/link_uniform_blocks.cpp @@ -244,18 +244,21 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name, for (unsigned j = 0; j < ub_array->num_array_elements; j++) { size_t new_length = name_length; + unsigned int element_idx = ub_array->array_elements[j]; /* Append the subscript to the current variable name */ - ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", - ub_array->array_elements[j]); + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", element_idx); if (ub_array->array) { + unsigned boffset = (*binding_offset) + (element_idx * + ub_array->original_dim_size); process_block_array(ub_array->array, name, new_length, blocks, parcel, variables, b, block_index, - binding_offset, ctx, prog, first_index); + &boffset, ctx, prog, first_index); } else { + unsigned boffset = (*binding_offset) + element_idx; process_block_array_leaf(*name, blocks, parcel, variables, b, block_index, - binding_offset, *block_index - first_index, + &boffset, *block_index - first_index, ctx, prog); } } @@ -307,7 +310,6 @@ process_block_array_leaf(const char *name, (unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms); *block_index = *block_index + 1; - *binding_offset = *binding_offset + 1; } /* This function resizes the array types of the block so that later we can use @@ -440,6 +442,7 
@@ link_uniform_blocks(void *mem_ctx, GLSL_INTERFACE_PACKING_PACKED)) { b->type = resize_block_array(b->type, b->array); b->var->type = b->type; + b->var->data.max_array_access = b->type->length - 1; } block_size.num_active_uniforms = 0; diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp index 63e688b19a7..13fc603ce7a 100644 --- a/src/compiler/glsl/link_uniforms.cpp +++ b/src/compiler/glsl/link_uniforms.cpp @@ -62,6 +62,15 @@ program_resource_visitor::process(const glsl_type *type, const char *name, void program_resource_visitor::process(ir_variable *var, bool use_std430_as_default) +{ + const glsl_type *t = + var->data.from_named_ifc_block ? var->get_interface_type() : var->type; + process(var, t, use_std430_as_default); +} + +void +program_resource_visitor::process(ir_variable *var, const glsl_type *var_type, + bool use_std430_as_default) { unsigned record_array_count = 1; const bool row_major = @@ -72,8 +81,7 @@ program_resource_visitor::process(ir_variable *var, bool use_std430_as_default) get_internal_ifc_packing(use_std430_as_default) : var->type->get_internal_ifc_packing(use_std430_as_default); - const glsl_type *t = - var->data.from_named_ifc_block ? var->get_interface_type() : var->type; + const glsl_type *t = var_type; const glsl_type *t_without_array = t->without_array(); /* false is always passed for the row_major parameter to the other diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 3969c0120b3..28187e2f0a4 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -309,16 +309,16 @@ cross_validate_types_and_qualifiers(struct gl_context *ctx, * "The invariance of varyings that are declared in both the vertex * and fragment shaders must match." */ - if (input->data.invariant != output->data.invariant && + if (input->data.explicit_invariant != output->data.explicit_invariant && prog->data->Version < (prog->IsES ? 
300 : 430)) { linker_error(prog, "%s shader output `%s' %s invariant qualifier, " "but %s shader input %s invariant qualifier\n", _mesa_shader_stage_to_string(producer_stage), output->name, - (output->data.invariant) ? "has" : "lacks", + (output->data.explicit_invariant) ? "has" : "lacks", _mesa_shader_stage_to_string(consumer_stage), - (input->data.invariant) ? "has" : "lacks"); + (input->data.explicit_invariant) ? "has" : "lacks"); return; } @@ -424,28 +424,14 @@ compute_variable_location_slot(ir_variable *var, gl_shader_stage stage) struct explicit_location_info { ir_variable *var; - unsigned numerical_type; + bool base_type_is_integer; + unsigned base_type_bit_size; unsigned interpolation; bool centroid; bool sample; bool patch; }; -static inline unsigned -get_numerical_type(const glsl_type *type) -{ - /* From the OpenGL 4.6 spec, section 4.4.1 Input Layout Qualifiers, Page 68, - * (Location aliasing): - * - * "Further, when location aliasing, the aliases sharing the location - * must have the same underlying numerical type (floating-point or - * integer) - */ - if (type->is_float() || type->is_double()) - return GLSL_TYPE_FLOAT; - return GLSL_TYPE_INT; -} - static bool check_location_aliasing(struct explicit_location_info explicit_locations[][4], ir_variable *var, @@ -461,14 +447,23 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], gl_shader_stage stage) { unsigned last_comp; - if (type->without_array()->is_record()) { - /* The component qualifier can't be used on structs so just treat - * all component slots as used. + unsigned base_type_bit_size; + const glsl_type *type_without_array = type->without_array(); + const bool base_type_is_integer = + glsl_base_type_is_integer(type_without_array->base_type); + const bool is_struct = type_without_array->is_record(); + if (is_struct) { + /* structs don't have a defined underlying base type so just treat all + * component slots as used and set the bit size to 0. 
If there is + * location aliasing, we'll fail anyway later. */ last_comp = 4; + base_type_bit_size = 0; } else { - unsigned dmul = type->without_array()->is_64bit() ? 2 : 1; - last_comp = component + type->without_array()->vector_elements * dmul; + unsigned dmul = type_without_array->is_64bit() ? 2 : 1; + last_comp = component + type_without_array->vector_elements * dmul; + base_type_bit_size = + glsl_base_type_get_bit_size(type_without_array->base_type); } while (location < location_limit) { @@ -478,8 +473,22 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], &explicit_locations[location][comp]; if (info->var) { - /* Component aliasing is not alloed */ - if (comp >= component && comp < last_comp) { + if (info->var->type->without_array()->is_record() || is_struct) { + /* Structs cannot share location since they are incompatible + * with any other underlying numerical type. + */ + linker_error(prog, + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "underlying numerical type. Struct variable '%s', " + "location %u\n", + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? "in" : "out", + is_struct ? 
var->name : info->var->name, + location); + return false; + } else if (comp >= component && comp < last_comp) { + /* Component aliasing is not allowed */ linker_error(prog, "%s shader has multiple %sputs explicitly " "assigned to location %d and component %d\n", @@ -488,27 +497,52 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], location, comp); return false; } else { - /* For all other used components we need to have matching - * types, interpolation and auxiliary storage + /* From the OpenGL 4.60.5 spec, section 4.4.1 Input Layout + * Qualifiers, Page 67, (Location aliasing): + * + * " Further, when location aliasing, the aliases sharing the + * location must have the same underlying numerical type + * and bit width (floating-point or integer, 32-bit versus + * 64-bit, etc.) and the same auxiliary storage and + * interpolation qualification." */ - if (info->numerical_type != - get_numerical_type(type->without_array())) { + + /* If the underlying numerical type isn't integer, implicitly + * it will be float or else we would have failed by now. + */ + if (info->base_type_is_integer != base_type_is_integer) { linker_error(prog, - "Varyings sharing the same location must " - "have the same underlying numerical type. " - "Location %u component %u\n", - location, comp); + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "underlying numerical type. Location %u " + "component %u.\n", + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? + "in" : "out", location, comp); + return false; + } + + if (info->base_type_bit_size != base_type_bit_size) { + linker_error(prog, + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "underlying numerical bit size. Location %u " + "component %u.\n", + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? 
+ "in" : "out", location, comp); return false; } if (info->interpolation != interpolation) { linker_error(prog, - "%s shader has multiple %sputs at explicit " - "location %u with different interpolation " - "settings\n", + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "interpolation qualification. Location %u " + "component %u.\n", _mesa_shader_stage_to_string(stage), var->data.mode == ir_var_shader_in ? - "in" : "out", location); + "in" : "out", location, comp); return false; } @@ -516,17 +550,20 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], info->sample != sample || info->patch != patch) { linker_error(prog, - "%s shader has multiple %sputs at explicit " - "location %u with different aux storage\n", + "%s shader has multiple %sputs sharing the " + "same location that don't have the same " + "auxiliary storage qualification. Location %u " + "component %u.\n", _mesa_shader_stage_to_string(stage), var->data.mode == ir_var_shader_in ? - "in" : "out", location); + "in" : "out", location, comp); return false; } } } else if (comp >= component && comp < last_comp) { info->var = var; - info->numerical_type = get_numerical_type(type->without_array()); + info->base_type_is_integer = base_type_is_integer; + info->base_type_bit_size = base_type_bit_size; info->interpolation = interpolation; info->centroid = centroid; info->sample = sample; @@ -773,8 +810,20 @@ cross_validate_outputs_to_inputs(struct gl_context *ctx, output = explicit_locations[idx][input->data.location_frac].var; - if (output == NULL || - input->data.location != output->data.location) { + if (output == NULL) { + /* A linker failure should only happen when there is no + * output declaration and there is Static Use of the + * declared input. 
+ */ + if (input->data.used) { + linker_error(prog, + "%s shader input `%s' with explicit location " + "has no matching output\n", + _mesa_shader_stage_to_string(consumer->Stage), + input->name); + break; + } + } else if (input->data.location != output->data.location) { linker_error(prog, "%s shader input `%s' with explicit location " "has no matching output\n", @@ -804,7 +853,7 @@ cross_validate_outputs_to_inputs(struct gl_context *ctx, */ assert(!input->data.assigned); if (input->data.used && !input->get_interface_type() && - !input->data.explicit_location && !prog->SeparateShader) + !input->data.explicit_location) linker_error(prog, "%s shader input `%s' " "has no matching output in the previous stage\n", @@ -1166,8 +1215,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog, return false; } - if ((this->offset / 4) / info->Buffers[buffer].Stride != - (xfb_offset - 1) / info->Buffers[buffer].Stride) { + if (xfb_offset > info->Buffers[buffer].Stride) { linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for " "buffer (%d)", xfb_offset * 4, info->Buffers[buffer].Stride * 4, buffer); @@ -2124,9 +2172,11 @@ class tfeedback_candidate_generator : public program_resource_visitor { public: tfeedback_candidate_generator(void *mem_ctx, - hash_table *tfeedback_candidates) + hash_table *tfeedback_candidates, + gl_shader_stage stage) : mem_ctx(mem_ctx), tfeedback_candidates(tfeedback_candidates), + stage(stage), toplevel_var(NULL), varying_floats(0) { @@ -2136,10 +2186,17 @@ class tfeedback_candidate_generator : public program_resource_visitor { /* All named varying interface blocks should be flattened by now */ assert(!var->is_interface_instance()); + assert(var->data.mode == ir_var_shader_out); this->toplevel_var = var; this->varying_floats = 0; - program_resource_visitor::process(var, false); + const glsl_type *t = + var->data.from_named_ifc_block ? 
var->get_interface_type() : var->type; + if (!var->data.patch && stage == MESA_SHADER_TESS_CTRL) { + assert(t->is_array()); + t = t->fields.array; + } + program_resource_visitor::process(var, t, false); } private: @@ -2173,6 +2230,8 @@ class tfeedback_candidate_generator : public program_resource_visitor */ hash_table * const tfeedback_candidates; + gl_shader_stage stage; + /** * Pointer to the toplevel variable that is being traversed. */ @@ -2503,8 +2562,28 @@ assign_varying_locations(struct gl_context *ctx, producer->Stage == MESA_SHADER_GEOMETRY)); if (num_tfeedback_decls > 0) { - tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates); - g.process(output_var); + tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates, producer->Stage); + /* From OpenGL 4.6 (Core Profile) spec, section 11.1.2.1 + * ("Vertex Shader Variables / Output Variables") + * + * "Each program object can specify a set of output variables from + * one shader to be recorded in transform feedback mode (see + * section 13.3). The variables that can be recorded are those + * emitted by the first active shader, in order, from the + * following list: + * + * * geometry shader + * * tessellation evaluation shader + * * tessellation control shader + * * vertex shader" + * + * But on OpenGL ES 3.2, section 11.1.2.1 ("Vertex Shader + * Variables / Output Variables") tessellation control shader is + * not included in the stages list. 
+ */ + if (!prog->IsES || producer->Stage != MESA_SHADER_TESS_CTRL) { + g.process(output_var); + } } ir_variable *const input_var = diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index 2d76e852f47..0d9b1befdd5 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -1090,7 +1090,7 @@ cross_validate_globals(struct gl_context *ctx, struct gl_shader_program *prog, } } - if (existing->data.invariant != var->data.invariant) { + if (existing->data.explicit_invariant != var->data.explicit_invariant) { linker_error(prog, "declarations for %s `%s' have " "mismatching invariant qualifiers\n", mode_string(var), var->name); diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h index f6fb00351d4..be92dbf983c 100644 --- a/src/compiler/glsl/linker.h +++ b/src/compiler/glsl/linker.h @@ -134,6 +134,26 @@ class program_resource_visitor { */ void process(ir_variable *var, bool use_std430_as_default); + /** + * Begin processing a variable + * + * Classes that overload this function should call \c ::process from the + * base class to start the recursive processing of the variable. + * + * \param var The variable that is to be processed + * \param var_type The glsl_type reference of the variable + * + * Calls \c ::visit_field for each leaf of the variable. + * + * \warning + * When processing a uniform block, this entry should only be used in cases + * where the row / column ordering of matrices in the block does not + * matter. For example, enumerating the names of members of the block, but + * not for determining the offsets of members. + */ + void process(ir_variable *var, const glsl_type *var_type, + bool use_std430_as_default); + /** * Begin processing a variable of a structured type. 
* diff --git a/src/compiler/glsl/list.h b/src/compiler/glsl/list.h index 59ed766f2e1..979f6fcc539 100644 --- a/src/compiler/glsl/list.h +++ b/src/compiler/glsl/list.h @@ -81,6 +81,12 @@ struct exec_node { * Insert a node in the list after the current node */ void insert_after(exec_node *after); + + /** + * Insert another list in the list after the current node + */ + void insert_after(struct exec_list *after); + /** * Insert a node in the list before the current node */ @@ -507,6 +513,21 @@ exec_list_append(struct exec_list *list, struct exec_list *source) exec_list_make_empty(source); } +static inline void +exec_node_insert_list_after(struct exec_node *n, struct exec_list *after) +{ + if (exec_list_is_empty(after)) + return; + + after->tail_sentinel.prev->next = n->next; + after->head_sentinel.next->prev = n; + + n->next->prev = after->tail_sentinel.prev; + n->next = after->head_sentinel.next; + + exec_list_make_empty(after); +} + static inline void exec_list_prepend(struct exec_list *list, struct exec_list *source) { @@ -635,6 +656,11 @@ inline void exec_list::append_list(exec_list *source) exec_list_append(this, source); } +inline void exec_node::insert_after(exec_list *after) +{ + exec_node_insert_list_after(this, after); +} + inline void exec_list::prepend_list(exec_list *source) { exec_list_prepend(this, source); diff --git a/src/compiler/glsl/lower_vector_derefs.cpp b/src/compiler/glsl/lower_vector_derefs.cpp index 6cd9a2d819a..2aae30d8201 100644 --- a/src/compiler/glsl/lower_vector_derefs.cpp +++ b/src/compiler/glsl/lower_vector_derefs.cpp @@ -32,8 +32,9 @@ namespace { class vector_deref_visitor : public ir_rvalue_enter_visitor { public: - vector_deref_visitor() - : progress(false) + vector_deref_visitor(void *mem_ctx, gl_shader_stage shader_stage) + : progress(false), shader_stage(shader_stage), + factory(&factory_instructions, mem_ctx) { } @@ -45,6 +46,9 @@ class vector_deref_visitor : public ir_rvalue_enter_visitor { virtual ir_visitor_status 
visit_enter(ir_assignment *ir); bool progress; + gl_shader_stage shader_stage; + exec_list factory_instructions; + ir_factory factory; }; } /* anonymous namespace */ @@ -65,13 +69,63 @@ vector_deref_visitor::visit_enter(ir_assignment *ir) ir_constant *old_index_constant = deref->array_index->constant_expression_value(mem_ctx); if (!old_index_constant) { - ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert, - new_lhs->type, - new_lhs->clone(mem_ctx, NULL), - ir->rhs, - deref->array_index); - ir->write_mask = (1 << new_lhs->type->vector_elements) - 1; - ir->set_lhs(new_lhs); + if (shader_stage == MESA_SHADER_TESS_CTRL && + deref->variable_referenced()->data.mode == ir_var_shader_out) { + /* Tessellation control shader outputs act as if they have memory + * backing them and if we have writes from multiple threads + * targeting the same vec4 (this can happen for patch outputs), the + * load-vec-store pattern of ir_triop_vector_insert doesn't work. + * Instead, we have to lower to a series of conditional write-masked + * assignments. + */ + ir_variable *const src_temp = + factory.make_temp(ir->rhs->type, "scalar_tmp"); + + /* The newly created variable declaration goes before the assignment + * because we're going to set it as the new LHS. 
+ */ + ir->insert_before(factory.instructions); + ir->set_lhs(new(mem_ctx) ir_dereference_variable(src_temp)); + + ir_variable *const arr_index = + factory.make_temp(deref->array_index->type, "index_tmp"); + factory.emit(assign(arr_index, deref->array_index)); + + for (unsigned i = 0; i < new_lhs->type->vector_elements; i++) { + ir_constant *const cmp_index = + ir_constant::zero(factory.mem_ctx, deref->array_index->type); + cmp_index->value.u[0] = i; + + ir_rvalue *const lhs_clone = new_lhs->clone(factory.mem_ctx, NULL); + ir_dereference_variable *const src_temp_deref = + new(mem_ctx) ir_dereference_variable(src_temp); + + if (new_lhs->ir_type != ir_type_swizzle) { + assert(lhs_clone->as_dereference()); + ir_assignment *cond_assign = + new(mem_ctx) ir_assignment(lhs_clone->as_dereference(), + src_temp_deref, + equal(arr_index, cmp_index), + WRITEMASK_X << i); + factory.emit(cond_assign); + } else { + ir_assignment *cond_assign = + new(mem_ctx) ir_assignment(swizzle(lhs_clone, i, 1), + src_temp_deref, + equal(arr_index, cmp_index)); + factory.emit(cond_assign); + } + } + ir->insert_after(factory.instructions); + } else { + ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert, + new_lhs->type, + new_lhs->clone(mem_ctx, NULL), + ir->rhs, + deref->array_index); + ir->write_mask = (1 << new_lhs->type->vector_elements) - 1; + ir->set_lhs(new_lhs); + } } else if (new_lhs->ir_type != ir_type_swizzle) { ir->set_lhs(new_lhs); ir->write_mask = 1 << old_index_constant->get_uint_component(0); @@ -105,7 +159,7 @@ vector_deref_visitor::handle_rvalue(ir_rvalue **rv) bool lower_vector_derefs(gl_linked_shader *shader) { - vector_deref_visitor v; + vector_deref_visitor v(shader->ir, shader->Stage); visit_list_elements(&v, shader->ir); diff --git a/src/compiler/glsl/serialize.cpp b/src/compiler/glsl/serialize.cpp index fdd99ec59da..ad258f8bcb1 100644 --- a/src/compiler/glsl/serialize.cpp +++ b/src/compiler/glsl/serialize.cpp @@ -996,15 +996,14 @@ write_shader_parameters(struct 
blob *metadata, struct gl_program_parameter_list *params) { blob_write_uint32(metadata, params->NumParameters); - blob_write_uint32(metadata, params->NumParameterValues); uint32_t i = 0; while (i < params->NumParameters) { struct gl_program_parameter *param = ¶ms->Parameters[i]; - blob_write_uint32(metadata, param->Type); blob_write_string(metadata, param->Name); blob_write_uint32(metadata, param->Size); + blob_write_uint32(metadata, param->Padded); blob_write_uint32(metadata, param->DataType); blob_write_bytes(metadata, param->StateIndexes, sizeof(param->StateIndexes)); @@ -1015,9 +1014,6 @@ write_shader_parameters(struct blob *metadata, blob_write_bytes(metadata, params->ParameterValues, sizeof(gl_constant_value) * params->NumParameterValues); - blob_write_bytes(metadata, params->ParameterValueOffset, - sizeof(uint32_t) * params->NumParameters); - blob_write_uint32(metadata, params->StateFlags); } @@ -1028,28 +1024,25 @@ read_shader_parameters(struct blob_reader *metadata, gl_state_index16 state_indexes[STATE_LENGTH]; uint32_t i = 0; uint32_t num_parameters = blob_read_uint32(metadata); - uint32_t num_parameters_values = blob_read_uint32(metadata); _mesa_reserve_parameter_storage(params, num_parameters); while (i < num_parameters) { gl_register_file type = (gl_register_file) blob_read_uint32(metadata); const char *name = blob_read_string(metadata); unsigned size = blob_read_uint32(metadata); + bool padded = blob_read_uint32(metadata); unsigned data_type = blob_read_uint32(metadata); blob_copy_bytes(metadata, (uint8_t *) state_indexes, sizeof(state_indexes)); _mesa_add_parameter(params, type, name, size, data_type, - NULL, state_indexes, false); + NULL, state_indexes, padded); i++; } blob_copy_bytes(metadata, (uint8_t *) params->ParameterValues, - sizeof(gl_constant_value) * num_parameters_values); - - blob_copy_bytes(metadata, (uint8_t *) params->ParameterValueOffset, - sizeof(uint32_t) * num_parameters); + sizeof(gl_constant_value) * params->NumParameterValues); 
params->StateFlags = blob_read_uint32(metadata); } diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp index 90f4548030f..042f45a926d 100644 --- a/src/compiler/glsl_types.cpp +++ b/src/compiler/glsl_types.cpp @@ -260,6 +260,22 @@ glsl_type::contains_double() const } } +bool +glsl_type::contains_64bit() const +{ + if (this->is_array()) { + return this->fields.array->contains_64bit(); + } else if (this->is_record() || this->is_interface()) { + for (unsigned int i = 0; i < this->length; i++) { + if (this->fields.structure[i].type->contains_64bit()) + return true; + } + return false; + } else { + return this->is_64bit(); + } +} + bool glsl_type::contains_opaque() const { switch (base_type) { diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index bdaeee7ddd7..4767d197449 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -31,6 +31,7 @@ #include "shader_enums.h" #include "blob.h" #include "c11/threads.h" +#include "util/macros.h" #ifdef __cplusplus #include "main/config.h" @@ -114,6 +115,42 @@ static inline bool glsl_base_type_is_integer(enum glsl_base_type type) type == GLSL_TYPE_IMAGE; } +static inline unsigned int +glsl_base_type_get_bit_size(const enum glsl_base_type base_type) +{ + switch (base_type) { + case GLSL_TYPE_BOOL: + return 1; + + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_FLOAT: /* TODO handle mediump */ + case GLSL_TYPE_SUBROUTINE: + return 32; + + case GLSL_TYPE_FLOAT16: + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: + return 16; + + case GLSL_TYPE_UINT8: + case GLSL_TYPE_INT8: + return 8; + + case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + case GLSL_TYPE_IMAGE: + case GLSL_TYPE_SAMPLER: + return 64; + + default: + unreachable("unknown base type"); + } + + return 0; +} + enum glsl_sampler_dim { GLSL_SAMPLER_DIM_1D = 0, GLSL_SAMPLER_DIM_2D, @@ -544,6 +581,12 @@ struct glsl_type { */ bool contains_double() const; + /** + * Query whether or not type is 
a 64-bit type, or for struct, interface and + * array types, contains a double type. + */ + bool contains_64bit() const; + /** * Query whether or not a type is a float type */ diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index 20a26a26255..e6784fcd41f 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -112,6 +112,7 @@ files_libnir = files( 'nir_lower_alu.c', 'nir_lower_alu_to_scalar.c', 'nir_lower_alpha_test.c', + 'nir_lower_array_deref_of_vec.c', 'nir_lower_atomics_to_ssbo.c', 'nir_lower_bitmap.c', 'nir_lower_bool_to_float.c', @@ -133,6 +134,7 @@ files_libnir = files( 'nir_lower_io_arrays_to_elements.c', 'nir_lower_io_to_temporaries.c', 'nir_lower_io_to_scalar.c', + 'nir_lower_io_to_vector.c', 'nir_lower_packing.c', 'nir_lower_passthrough_edgeflags.c', 'nir_lower_patch_vertices.c', diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ff2c41faf27..c43226ba8df 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2825,7 +2825,7 @@ should_print_nir(void) static inline void nir_validate_shader(nir_shader *shader, const char *when) { (void) shader; (void)when; } static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; } static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; } -static inline bool should_skip_nir(const char *pass_name) { return false; } +static inline bool should_skip_nir(UNUSED const char *pass_name) { return false; } static inline bool should_clone_nir(void) { return false; } static inline bool should_serialize_deserialize_nir(void) { return false; } static inline bool should_print_nir(void) { return false; } @@ -2910,6 +2910,16 @@ void nir_fixup_deref_modes(nir_shader *shader); bool nir_lower_global_vars_to_local(nir_shader *shader); +typedef enum { + nir_lower_direct_array_deref_of_vec_load = (1 << 0), + nir_lower_indirect_array_deref_of_vec_load = (1 << 1), + 
nir_lower_direct_array_deref_of_vec_store = (1 << 2), + nir_lower_indirect_array_deref_of_vec_store = (1 << 3), +} nir_lower_array_deref_of_vec_options; + +bool nir_lower_array_deref_of_vec(nir_shader *shader, nir_variable_mode modes, + nir_lower_array_deref_of_vec_options options); + bool nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes); bool nir_lower_locals_to_regs(nir_shader *shader); @@ -2998,6 +3008,7 @@ void nir_lower_io_arrays_to_elements_no_indirects(nir_shader *shader, bool outputs_only); void nir_lower_io_to_scalar(nir_shader *shader, nir_variable_mode mask); void nir_lower_io_to_scalar_early(nir_shader *shader, nir_variable_mode mask); +bool nir_lower_io_to_vector(nir_shader *shader, nir_variable_mode mask); typedef struct nir_lower_subgroups_options { uint8_t subgroup_size; @@ -3090,6 +3101,9 @@ typedef struct nir_lower_tex_options { */ uint8_t swizzles[32][4]; + /* Can be used to scale sampled values in range required by the format. */ + float scale_factors[32]; + /** * Bitmap of textures that need srgb to linear conversion. If * (lower_srgb & (1 << texture_index)) then the rgb (xyz) components @@ -3138,6 +3152,12 @@ typedef struct nir_lower_tex_options { */ bool lower_txd_offset_clamp; + /** + * If true, lower nir_texop_txd with min_lod to a nir_texop_txl if the + * sampler index is not statically determinable to be less than 16. + */ + bool lower_txd_clamp_if_sampler_index_not_lt_16; + /** * If true, apply a .bagr swizzle on tg4 results to handle Broadcom's * mixed-up tg4 locations. 
@@ -3316,7 +3336,7 @@ bool nir_opt_move_comparisons(nir_shader *shader); bool nir_opt_move_load_ubo(nir_shader *shader); bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok); + bool indirect_load_ok); bool nir_opt_remove_phis(nir_shader *shader); diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index 2a36eb3c91b..101bc7ad637 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -560,6 +560,35 @@ nir_channels(nir_builder *b, nir_ssa_def *def, nir_component_mask_t mask) return nir_swizzle(b, def, swizzle, num_channels, false); } +static inline nir_ssa_def * +_nir_vector_extract_helper(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *c, + unsigned start, unsigned end) +{ + if (start == end - 1) { + return nir_channel(b, vec, start); + } else { + unsigned mid = start + (end - start) / 2; + return nir_bcsel(b, nir_ilt(b, c, nir_imm_int(b, mid)), + _nir_vector_extract_helper(b, vec, c, start, mid), + _nir_vector_extract_helper(b, vec, c, mid, end)); + } +} + +static inline nir_ssa_def * +nir_vector_extract(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *c) +{ + nir_src c_src = nir_src_for_ssa(c); + if (nir_src_is_const(c_src)) { + unsigned c_const = nir_src_as_uint(c_src); + if (c_const < vec->num_components) + return nir_channel(b, vec, c_const); + else + return nir_ssa_undef(b, 1, vec->bit_size); + } else { + return _nir_vector_extract_helper(b, vec, c, 0, vec->num_components); + } +} + static inline nir_ssa_def * nir_i2i(nir_builder *build, nir_ssa_def *x, unsigned dest_bit_size) { diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c index 557c7d29f53..24bef4f523a 100644 --- a/src/compiler/nir/nir_clone.c +++ b/src/compiler/nir/nir_clone.c @@ -151,9 +151,11 @@ nir_variable_clone(const nir_variable *var, nir_shader *shader) nvar->name = ralloc_strdup(nvar, var->name); nvar->data = var->data; nvar->num_state_slots = var->num_state_slots; 
- nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots); - memcpy(nvar->state_slots, var->state_slots, - var->num_state_slots * sizeof(nir_state_slot)); + if (var->num_state_slots) { + nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots); + memcpy(nvar->state_slots, var->state_slots, + var->num_state_slots * sizeof(nir_state_slot)); + } if (var->constant_initializer) { nvar->constant_initializer = nir_constant_clone(var->constant_initializer, nvar); diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c index 2f5fda643ca..1e321a66208 100644 --- a/src/compiler/nir/nir_deref.c +++ b/src/compiler/nir/nir_deref.c @@ -215,7 +215,7 @@ nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref, unsigned field_offset = struct_type_get_field_offset(parent->type, size_align, (*p)->strct.index); - nir_iadd(b, offset, nir_imm_int(b, field_offset)); + offset = nir_iadd(b, offset, nir_imm_int(b, field_offset)); } else { unreachable("Unsupported deref type"); } @@ -574,10 +574,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl) _mesa_hash_table_clear(state.cache, NULL); nir_foreach_instr_safe(instr, block) { - if (instr->type == nir_instr_type_deref) { - nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)); + if (instr->type == nir_instr_type_deref && + nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr))) continue; - } state.builder.cursor = nir_before_instr(instr); nir_foreach_src(instr, rematerialize_deref_src, &state); diff --git a/src/compiler/nir/nir_gather_xfb_info.c b/src/compiler/nir/nir_gather_xfb_info.c index 96f0ece5e75..f2a2c0e6b99 100644 --- a/src/compiler/nir/nir_gather_xfb_info.c +++ b/src/compiler/nir/nir_gather_xfb_info.c @@ -33,7 +33,11 @@ add_var_xfb_outputs(nir_xfb_info *xfb, unsigned *offset, const struct glsl_type *type) { - if (glsl_type_is_array(type) || glsl_type_is_matrix(type)) { + /* If this type contains a 64-bit value, align to 8 bytes */ + if 
(glsl_type_contains_64bit(type)) + *offset = ALIGN_POT(*offset, 8); + + if (glsl_type_is_array_or_matrix(type) && !var->data.compact) { unsigned length = glsl_get_length(type); const struct glsl_type *child_type = glsl_get_array_element(type); for (unsigned i = 0; i < length; i++) @@ -58,32 +62,43 @@ add_var_xfb_outputs(nir_xfb_info *xfb, assert(var->data.stream < NIR_MAX_XFB_STREAMS); xfb->streams_written |= (1 << var->data.stream); - unsigned comp_slots = glsl_get_component_slots(type); - unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4); - assert(attrib_slots == glsl_count_attribute_slots(type, false)); - - /* Ensure that we don't have, for instance, a dvec2 with a location_frac - * of 2 which would make it crass a location boundary even though it - * fits in a single slot. However, you can have a dvec3 which crosses - * the slot boundary with a location_frac of 2. - */ - assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == attrib_slots); + unsigned comp_slots; + if (var->data.compact) { + /* This only happens for clip/cull which are float arrays */ + assert(glsl_without_array(type) == glsl_float_type()); + assert(var->data.location == VARYING_SLOT_CLIP_DIST0 || + var->data.location == VARYING_SLOT_CLIP_DIST1); + comp_slots = glsl_get_length(type); + } else { + comp_slots = glsl_get_component_slots(type); + + unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4); + assert(attrib_slots == glsl_count_attribute_slots(type, false)); + + /* Ensure that we don't have, for instance, a dvec2 with a + * location_frac of 2 which would make it cross a location boundary
+ */ + assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == + attrib_slots); + } assert(var->data.location_frac + comp_slots <= 8); uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac; - assert(attrib_slots <= 2); - for (unsigned s = 0; s < attrib_slots; s++) { + while (comp_mask) { nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++]; output->buffer = buffer; - output->offset = *offset + s * 16; + output->offset = *offset; output->location = *location; - output->component_mask = (comp_mask >> (s * 4)) & 0xf; + output->component_mask = comp_mask & 0xf; + *offset += util_bitcount(output->component_mask) * 4; (*location)++; + comp_mask >>= 4; } - *offset += comp_slots * 4; } } diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c index aaa4204cce9..764fd6d443e 100644 --- a/src/compiler/nir/nir_linking_helpers.c +++ b/src/compiler/nir/nir_linking_helpers.c @@ -59,6 +59,15 @@ get_variable_io_mask(nir_variable *var, gl_shader_stage stage) return ((1ull << slots) - 1) << location; } +static uint8_t +get_num_components(nir_variable *var) +{ + if (glsl_type_is_struct(glsl_without_array(var->type))) + return 4; + + return glsl_get_vector_elements(glsl_without_array(var->type)); +} + static void tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read) { @@ -80,12 +89,14 @@ tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read) continue; nir_variable *var = nir_deref_instr_get_variable(deref); - if (var->data.patch) { - patches_read[var->data.location_frac] |= - get_variable_io_mask(var, shader->info.stage); - } else { - read[var->data.location_frac] |= - get_variable_io_mask(var, shader->info.stage); + for (unsigned i = 0; i < get_num_components(var); i++) { + if (var->data.patch) { + patches_read[var->data.location_frac + i] |= + get_variable_io_mask(var, shader->info.stage); + } else { + read[var->data.location_frac + i] |= + 
get_variable_io_mask(var, shader->info.stage); + } } } } @@ -161,22 +172,26 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer) uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 }; nir_foreach_variable(var, &producer->outputs) { - if (var->data.patch) { - patches_written[var->data.location_frac] |= - get_variable_io_mask(var, producer->info.stage); - } else { - written[var->data.location_frac] |= - get_variable_io_mask(var, producer->info.stage); + for (unsigned i = 0; i < get_num_components(var); i++) { + if (var->data.patch) { + patches_written[var->data.location_frac + i] |= + get_variable_io_mask(var, producer->info.stage); + } else { + written[var->data.location_frac + i] |= + get_variable_io_mask(var, producer->info.stage); + } } } nir_foreach_variable(var, &consumer->inputs) { - if (var->data.patch) { - patches_read[var->data.location_frac] |= - get_variable_io_mask(var, consumer->info.stage); - } else { - read[var->data.location_frac] |= - get_variable_io_mask(var, consumer->info.stage); + for (unsigned i = 0; i < get_num_components(var); i++) { + if (var->data.patch) { + patches_read[var->data.location_frac + i] |= + get_variable_io_mask(var, consumer->info.stage); + } else { + read[var->data.location_frac + i] |= + get_variable_io_mask(var, consumer->info.stage); + } } } diff --git a/src/compiler/nir/nir_lower_array_deref_of_vec.c b/src/compiler/nir/nir_lower_array_deref_of_vec.c new file mode 100644 index 00000000000..2a70dd1ddbc --- /dev/null +++ b/src/compiler/nir/nir_lower_array_deref_of_vec.c @@ -0,0 +1,190 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" + +static void +build_write_masked_store(nir_builder *b, nir_deref_instr *vec_deref, + nir_ssa_def *value, unsigned component) +{ + assert(value->num_components == 1); + unsigned num_components = glsl_get_components(vec_deref->type); + assert(num_components > 1 && num_components <= NIR_MAX_VEC_COMPONENTS); + + nir_ssa_def *u = nir_ssa_undef(b, 1, value->bit_size); + nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; + for (unsigned i = 0; i < num_components; i++) + comps[i] = (i == component) ? 
value : u; + + nir_ssa_def *vec = nir_vec(b, comps, num_components); + nir_store_deref(b, vec_deref, vec, (1u << component)); +} + +static void +build_write_masked_stores(nir_builder *b, nir_deref_instr *vec_deref, + nir_ssa_def *value, nir_ssa_def *index, + unsigned start, unsigned end) +{ + if (start == end - 1) { + build_write_masked_store(b, vec_deref, value, start); + } else { + unsigned mid = start + (end - start) / 2; + nir_push_if(b, nir_ilt(b, index, nir_imm_int(b, mid))); + build_write_masked_stores(b, vec_deref, value, index, start, mid); + nir_push_else(b, NULL); + build_write_masked_stores(b, vec_deref, value, index, mid, end); + nir_pop_if(b, NULL); + } +} + +static bool +nir_lower_array_deref_of_vec_impl(nir_function_impl *impl, + nir_variable_mode modes, + nir_lower_array_deref_of_vec_options options) +{ + bool progress = false; + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + assert(intrin->intrinsic != nir_intrinsic_copy_deref); + + if (intrin->intrinsic != nir_intrinsic_load_deref && + intrin->intrinsic != nir_intrinsic_interp_deref_at_centroid && + intrin->intrinsic != nir_intrinsic_interp_deref_at_sample && + intrin->intrinsic != nir_intrinsic_interp_deref_at_offset && + intrin->intrinsic != nir_intrinsic_store_deref) + continue; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!(deref->mode & modes)) + continue; + + /* We only care about array derefs that act on vectors */ + if (deref->deref_type != nir_deref_type_array) + continue; + + nir_deref_instr *vec_deref = nir_deref_instr_parent(deref); + if (!glsl_type_is_vector(vec_deref->type)) + continue; + + assert(intrin->num_components == 1); + unsigned num_components = glsl_get_components(vec_deref->type); + assert(num_components > 1 && num_components <= 
NIR_MAX_VEC_COMPONENTS); + + b.cursor = nir_after_instr(&intrin->instr); + + if (intrin->intrinsic == nir_intrinsic_store_deref) { + assert(intrin->src[1].is_ssa); + nir_ssa_def *value = intrin->src[1].ssa; + + if (nir_src_is_const(deref->arr.index)) { + if (!(options & nir_lower_direct_array_deref_of_vec_store)) + continue; + + unsigned index = nir_src_as_uint(deref->arr.index); + /* If index is OOB, we throw the old store away and don't + * replace it with anything. + */ + if (index < num_components) + build_write_masked_store(&b, vec_deref, value, index); + } else { + if (!(options & nir_lower_indirect_array_deref_of_vec_store)) + continue; + + nir_ssa_def *index = nir_ssa_for_src(&b, deref->arr.index, 1); + build_write_masked_stores(&b, vec_deref, value, index, + 0, num_components); + } + nir_instr_remove(&intrin->instr); + + progress = true; + } else { + if (nir_src_is_const(deref->arr.index)) { + if (!(options & nir_lower_direct_array_deref_of_vec_load)) + continue; + } else { + if (!(options & nir_lower_indirect_array_deref_of_vec_load)) + continue; + } + + /* Turn the load into a vector load */ + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(&vec_deref->dest.ssa)); + intrin->dest.ssa.num_components = num_components; + intrin->num_components = num_components; + + nir_ssa_def *index = nir_ssa_for_src(&b, deref->arr.index, 1); + nir_ssa_def *scalar = + nir_vector_extract(&b, &intrin->dest.ssa, index); + if (scalar->parent_instr->type == nir_instr_type_ssa_undef) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(scalar)); + nir_instr_remove(&intrin->instr); + } else { + nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, + nir_src_for_ssa(scalar), + scalar->parent_instr); + } + progress = true; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + + return progress; +} + +/* Lowers away array dereferences on vectors + * + * These are allowed on certain 
variable types such as SSBOs and TCS outputs. + * However, not everyone can actually handle them everywhere. There are also + * cases where we want to lower them for performance reasons. + * + * This patch assumes that copy_deref instructions have already been lowered. + */ +bool +nir_lower_array_deref_of_vec(nir_shader *shader, nir_variable_mode modes, + nir_lower_array_deref_of_vec_options options) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl && + nir_lower_array_deref_of_vec_impl(function->impl, modes, options)) + progress = true; + } + + return progress; +} diff --git a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c index 6e1557ef40d..b7cd7c50b11 100644 --- a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c +++ b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c @@ -27,10 +27,10 @@ /** * @file * - * This pass combines separate clip and cull distance arrays into a - * single array that contains both. Clip distances come first, then - * cull distances. It also populates nir_shader_info with the size - * of the original arrays so the driver knows which are which. + * This pass combines clip and cull distance arrays in separate locations and + * colocates them both in VARYING_SLOT_CLIP_DIST0. It does so by maintaining + * two arrays but making them compact and using location_frac to stack them on + * top of each other. */ /** @@ -56,77 +56,6 @@ get_unwrapped_array_length(nir_shader *nir, nir_variable *var) return glsl_get_length(type); } -/** - * Update the type of the combined array (including interface block nesting). 
- */ -static void -update_type(nir_variable *var, gl_shader_stage stage, unsigned length) -{ - const struct glsl_type *type = glsl_array_type(glsl_float_type(), length, 0); - - if (nir_is_per_vertex_io(var, stage)) - type = glsl_array_type(type, glsl_get_length(var->type), 0); - - var->type = type; -} - -static void -rewrite_clip_cull_deref(nir_builder *b, - nir_deref_instr *deref, - const struct glsl_type *type, - unsigned tail_offset) -{ - deref->type = type; - - if (glsl_type_is_array(type)) { - const struct glsl_type *child_type = glsl_get_array_element(type); - nir_foreach_use(src, &deref->dest.ssa) { - rewrite_clip_cull_deref(b, nir_instr_as_deref(src->parent_instr), - child_type, tail_offset); - } - } else { - assert(glsl_type_is_scalar(type)); - - /* This is the end of the line. Add the tail offset if needed */ - if (tail_offset > 0) { - b->cursor = nir_before_instr(&deref->instr); - assert(deref->deref_type == nir_deref_type_array); - nir_ssa_def *index = nir_iadd(b, deref->arr.index.ssa, - nir_imm_int(b, tail_offset)); - nir_instr_rewrite_src(&deref->instr, &deref->arr.index, - nir_src_for_ssa(index)); - } - } -} - -static void -rewrite_references(nir_builder *b, - nir_instr *instr, - nir_variable *combined, - unsigned cull_offset) -{ - if (instr->type != nir_instr_type_deref) - return; - - nir_deref_instr *deref = nir_instr_as_deref(instr); - if (deref->deref_type != nir_deref_type_var) - return; - - if (deref->var->data.mode != combined->data.mode) - return; - - const unsigned location = deref->var->data.location; - if (location != VARYING_SLOT_CLIP_DIST0 && - location != VARYING_SLOT_CULL_DIST0) - return; - - deref->var = combined; - if (location == VARYING_SLOT_CULL_DIST0) - rewrite_clip_cull_deref(b, deref, combined->type, cull_offset); - else - rewrite_clip_cull_deref(b, deref, combined->type, 0); -} - static bool combine_clip_cull(nir_shader *nir, struct exec_list *vars, @@ -134,7 +63,6 @@ combine_clip_cull(nir_shader *nir, { nir_variable *cull = 
NULL; nir_variable *clip = NULL; - bool progress = false; nir_foreach_variable(var, vars) { if (var->data.location == VARYING_SLOT_CLIP_DIST0) @@ -144,7 +72,9 @@ combine_clip_cull(nir_shader *nir, cull = var; } - /* if the GLSL lowering pass has already run, don't bother repeating */ + if (!cull && !clip) + return false; + if (!cull && clip) { if (!glsl_type_is_array(clip->type)) return false; @@ -158,50 +88,29 @@ combine_clip_cull(nir_shader *nir, nir->info.cull_distance_array_size = cull_array_size; } - if (clip) - clip->data.compact = true; - - if (cull) - cull->data.compact = true; - - if (cull_array_size > 0) { - if (clip_array_size == 0) { - /* No clip distances, just change the cull distance location */ - cull->data.location = VARYING_SLOT_CLIP_DIST0; - } else { - /* Turn the ClipDistance array into a combined one */ - update_type(clip, nir->info.stage, clip_array_size + cull_array_size); - - /* Rewrite CullDistance to reference the combined array */ - nir_foreach_function(function, nir) { - if (function->impl) { - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr(instr, block) { - rewrite_references(&b, instr, clip, clip_array_size); - } - } - } - } - - /* Delete the old CullDistance variable */ - exec_node_remove(&cull->node); - ralloc_free(cull); - } + if (clip) { + assert(clip->data.compact); + clip->data.how_declared = nir_var_hidden; + } - nir_foreach_function(function, nir) { - if (function->impl) { - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + if (cull) { + assert(cull->data.compact); + cull->data.how_declared = nir_var_hidden; + cull->data.location = VARYING_SLOT_CLIP_DIST0 + clip_array_size / 4; + cull->data.location_frac = clip_array_size % 4; + } + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance | + 
nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); } - progress = true; } - return progress; + return true; } bool diff --git a/src/compiler/nir/nir_lower_io_to_temporaries.c b/src/compiler/nir/nir_lower_io_to_temporaries.c index 7602637d428..d2b069d3d68 100644 --- a/src/compiler/nir/nir_lower_io_to_temporaries.c +++ b/src/compiler/nir/nir_lower_io_to_temporaries.c @@ -85,7 +85,8 @@ emit_output_copies_impl(struct lower_io_state *state, nir_function_impl *impl) continue; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic == nir_intrinsic_emit_vertex) { + if (intrin->intrinsic == nir_intrinsic_emit_vertex || + intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter) { b.cursor = nir_before_instr(&intrin->instr); emit_copies(&b, &state->shader->outputs, &state->old_outputs); } diff --git a/src/compiler/nir/nir_lower_io_to_vector.c b/src/compiler/nir/nir_lower_io_to_vector.c new file mode 100644 index 00000000000..d979962373d --- /dev/null +++ b/src/compiler/nir/nir_lower_io_to_vector.c @@ -0,0 +1,387 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" +#include "nir_deref.h" + +/** @file nir_lower_io_to_vector.c + * + * Merges compatible input/output variables residing in different components + * of the same location. It's expected that further passes such as + * nir_lower_io_to_temporaries will combine loads and stores of the merged + * variables, producing vector nir_load_input/nir_store_output instructions + * when all is said and done. + */ + +static const struct glsl_type * +resize_array_vec_type(const struct glsl_type *type, unsigned num_components) +{ + if (glsl_type_is_array(type)) { + const struct glsl_type *arr_elem = + resize_array_vec_type(glsl_get_array_element(type), num_components); + return glsl_array_type(arr_elem, glsl_get_length(type), 0); + } else { + assert(glsl_type_is_vector_or_scalar(type)); + return glsl_vector_type(glsl_get_base_type(type), num_components); + } +} + +static bool +variable_can_rewrite(const nir_variable *var) +{ + /* Only touch user defined varyings as these are the only ones we split */ + if (var->data.location < VARYING_SLOT_VAR0) + return false; + + /* Skip complex types we don't split in the first place */ + if (!glsl_type_is_vector_or_scalar(glsl_without_array(var->type))) + return false; + + /* TODO: add 64/16bit support ? 
*/ + if (glsl_get_bit_size(glsl_without_array(var->type)) != 32) + return false; + + return true; +} + +static bool +variables_can_merge(nir_shader *shader, + const nir_variable *a, const nir_variable *b) +{ + const struct glsl_type *a_type_tail = a->type; + const struct glsl_type *b_type_tail = b->type; + + /* They must have the same array structure */ + while (glsl_type_is_array(a_type_tail)) { + if (!glsl_type_is_array(b_type_tail)) + return false; + + if (glsl_get_length(a_type_tail) != glsl_get_length(b_type_tail)) + return false; + + a_type_tail = glsl_get_array_element(a_type_tail); + b_type_tail = glsl_get_array_element(b_type_tail); + } + + if (!glsl_type_is_vector_or_scalar(a_type_tail) || + !glsl_type_is_vector_or_scalar(b_type_tail)) + return false; + + if (glsl_get_base_type(a->type) != glsl_get_base_type(b->type)) + return false; + + assert(a->data.mode == b->data.mode); + if (shader->info.stage == MESA_SHADER_FRAGMENT && + a->data.mode == nir_var_shader_in && + a->data.interpolation != b->data.interpolation) + return false; + + return true; +} + +static bool +create_new_io_vars(nir_shader *shader, struct exec_list *io_list, + nir_variable *old_vars[MAX_VARYINGS_INCL_PATCH][4], + nir_variable *new_vars[MAX_VARYINGS_INCL_PATCH][4]) +{ + if (exec_list_is_empty(io_list)) + return false; + + nir_foreach_variable(var, io_list) { + if (variable_can_rewrite(var)) { + unsigned loc = var->data.location - VARYING_SLOT_VAR0; + unsigned frac = var->data.location_frac; + old_vars[loc][frac] = var; + } + } + + bool merged_any_vars = false; + + /* We don't handle combining vars of different type e.g. different array + * lengths. 
+ */ + for (unsigned loc = 0; loc < MAX_VARYINGS_INCL_PATCH; loc++) { + unsigned frac = 0; + while (frac < 4) { + nir_variable *first_var = old_vars[loc][frac]; + if (!first_var) { + frac++; + continue; + } + + int first = frac; + bool found_merge = false; + + while (frac < 4) { + nir_variable *var = old_vars[loc][frac]; + if (!var) + break; + + if (var != first_var) { + if (!variables_can_merge(shader, first_var, var)) + break; + + found_merge = true; + } + + const unsigned num_components = + glsl_get_components(glsl_without_array(var->type)); + + /* We had better not have any overlapping vars */ + for (unsigned i = 1; i < num_components; i++) + assert(old_vars[loc][frac + i] == NULL); + + frac += num_components; + } + + if (!found_merge) + continue; + + merged_any_vars = true; + + nir_variable *var = nir_variable_clone(old_vars[loc][first], shader); + var->data.location_frac = first; + var->type = resize_array_vec_type(var->type, frac - first); + + nir_shader_add_variable(shader, var); + for (unsigned i = first; i < frac; i++) + new_vars[loc][i] = var; + } + } + + return merged_any_vars; +} + +static nir_deref_instr * +build_array_deref_of_new_var(nir_builder *b, nir_variable *new_var, + nir_deref_instr *leader) +{ + if (leader->deref_type == nir_deref_type_var) + return nir_build_deref_var(b, new_var); + + nir_deref_instr *parent = + build_array_deref_of_new_var(b, new_var, nir_deref_instr_parent(leader)); + + return nir_build_deref_follower(b, parent, leader); +} + +static bool +nir_lower_io_to_vector_impl(nir_function_impl *impl, nir_variable_mode modes) +{ + assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out))); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_metadata_require(impl, nir_metadata_dominance); + + nir_shader *shader = impl->function->shader; + nir_variable *old_inputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + nir_variable *new_inputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + nir_variable *old_outputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + 
nir_variable *new_outputs[MAX_VARYINGS_INCL_PATCH][4] = {0}; + + if (modes & nir_var_shader_in) { + /* Vertex shaders support overlapping inputs. We don't do those */ + assert(b.shader->info.stage != MESA_SHADER_VERTEX); + + /* If we don't actually merge any variables, remove that bit from modes + * so we don't bother doing extra non-work. + */ + if (!create_new_io_vars(shader, &shader->inputs, + old_inputs, new_inputs)) + modes &= ~nir_var_shader_in; + } + + if (modes & nir_var_shader_out) { + /* Fragment shader outputs are always vec4. You shouldn't have + * scalarized them and it doesn't make sense to vectorize them. + */ + assert(b.shader->info.stage != MESA_SHADER_FRAGMENT); + + /* If we don't actually merge any variables, remove that bit from modes + * so we don't bother doing extra non-work. + */ + if (!create_new_io_vars(shader, &shader->outputs, + old_outputs, new_outputs)) + modes &= ~nir_var_shader_out; + } + + if (!modes) + return false; + + bool progress = false; + + /* Actually lower all the IO load/store intrinsics. Load instructions are + * lowered to a vector load and an ALU instruction to grab the channels we + * want. Outputs are lowered to a write-masked store of the vector output. + * For non-TCS outputs, we then run nir_lower_io_to_temporaries at the end + * to clean up the partial writes. 
+ */ + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_interp_deref_at_centroid: + case nir_intrinsic_interp_deref_at_sample: + case nir_intrinsic_interp_deref_at_offset: { + nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]); + if (!(old_deref->mode & modes)) + break; + + if (old_deref->mode == nir_var_shader_out) + assert(b.shader->info.stage == MESA_SHADER_TESS_CTRL); + + nir_variable *old_var = nir_deref_instr_get_variable(old_deref); + if (old_var->data.location < VARYING_SLOT_VAR0) + break; + + const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0; + const unsigned old_frac = old_var->data.location_frac; + nir_variable *new_var = old_deref->mode == nir_var_shader_in ? + new_inputs[loc][old_frac] : + new_outputs[loc][old_frac]; + if (!new_var) + break; + + assert(new_var->data.location == VARYING_SLOT_VAR0 + loc); + const unsigned new_frac = new_var->data.location_frac; + + nir_component_mask_t vec4_comp_mask = + ((1 << intrin->num_components) - 1) << old_frac; + + b.cursor = nir_before_instr(&intrin->instr); + + /* Rewrite the load to use the new variable and only select a + * portion of the result. 
+ */ + nir_deref_instr *new_deref = + build_array_deref_of_new_var(&b, new_var, old_deref); + assert(glsl_type_is_vector(new_deref->type)); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(&new_deref->dest.ssa)); + + intrin->num_components = + glsl_get_components(new_deref->type); + intrin->dest.ssa.num_components = intrin->num_components; + + b.cursor = nir_after_instr(&intrin->instr); + + nir_ssa_def *new_vec = nir_channels(&b, &intrin->dest.ssa, + vec4_comp_mask >> new_frac); + nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, + nir_src_for_ssa(new_vec), + new_vec->parent_instr); + + progress = true; + break; + } + + case nir_intrinsic_store_deref: { + nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]); + if (old_deref->mode != nir_var_shader_out) + break; + + nir_variable *old_var = nir_deref_instr_get_variable(old_deref); + if (old_var->data.location < VARYING_SLOT_VAR0) + break; + + const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0; + const unsigned old_frac = old_var->data.location_frac; + nir_variable *new_var = new_outputs[loc][old_frac]; + if (!new_var) + break; + + assert(new_var->data.location == VARYING_SLOT_VAR0 + loc); + const unsigned new_frac = new_var->data.location_frac; + + b.cursor = nir_before_instr(&intrin->instr); + + /* Rewrite the store to be a masked store to the new variable */ + nir_deref_instr *new_deref = + build_array_deref_of_new_var(&b, new_var, old_deref); + assert(glsl_type_is_vector(new_deref->type)); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(&new_deref->dest.ssa)); + + intrin->num_components = + glsl_get_components(new_deref->type); + + nir_component_mask_t old_wrmask = nir_intrinsic_write_mask(intrin); + + assert(intrin->src[1].is_ssa); + nir_ssa_def *old_value = intrin->src[1].ssa; + nir_ssa_def *comps[4]; + for (unsigned c = 0; c < intrin->num_components; c++) { + if (new_frac + c >= old_frac && + (old_wrmask & 1 << (new_frac + c - 
old_frac))) { + comps[c] = nir_channel(&b, old_value, + new_frac + c - old_frac); + } else { + comps[c] = nir_ssa_undef(&b, old_value->num_components, + old_value->bit_size); + } + } + nir_ssa_def *new_value = nir_vec(&b, comps, intrin->num_components); + nir_instr_rewrite_src(&intrin->instr, &intrin->src[1], + nir_src_for_ssa(new_value)); + + nir_intrinsic_set_write_mask(intrin, + old_wrmask << (old_frac - new_frac)); + + progress = true; + break; + } + + default: + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + + return progress; +} + +bool +nir_lower_io_to_vector(nir_shader *shader, nir_variable_mode modes) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) + progress |= nir_lower_io_to_vector_impl(function->impl, modes); + } + + return progress; +} diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c index a618b86b34c..11afffe3dee 100644 --- a/src/compiler/nir/nir_lower_tex.c +++ b/src/compiler/nir/nir_lower_tex.c @@ -306,7 +306,8 @@ lower_implicit_lod(nir_builder *b, nir_tex_instr *tex) } static nir_ssa_def * -sample_plane(nir_builder *b, nir_tex_instr *tex, int plane) +sample_plane(nir_builder *b, nir_tex_instr *tex, int plane, + const nir_lower_tex_options *options) { assert(tex->dest.is_ssa); assert(nir_tex_instr_dest_size(tex) == 4); @@ -334,6 +335,11 @@ sample_plane(nir_builder *b, nir_tex_instr *tex, int plane) nir_builder_instr_insert(b, &plane_tex->instr); + /* If scaling_factor is set, return a scaled value. 
*/ + if (options->scale_factors[tex->texture_index]) + return nir_fmul_imm(b, &plane_tex->dest.ssa, + options->scale_factors[tex->texture_index]); + return &plane_tex->dest.ssa; } @@ -366,12 +372,13 @@ convert_yuv_to_rgb(nir_builder *b, nir_tex_instr *tex, } static void -lower_y_uv_external(nir_builder *b, nir_tex_instr *tex) +lower_y_uv_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *uv = sample_plane(b, tex, 1); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *uv = sample_plane(b, tex, 1, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 0), @@ -381,13 +388,14 @@ lower_y_uv_external(nir_builder *b, nir_tex_instr *tex) } static void -lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex) +lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *u = sample_plane(b, tex, 1); - nir_ssa_def *v = sample_plane(b, tex, 2); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *u = sample_plane(b, tex, 1, options); + nir_ssa_def *v = sample_plane(b, tex, 2, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 0), @@ -397,12 +405,13 @@ lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex) } static void -lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex) +lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *xuxv = sample_plane(b, tex, 1); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *xuxv = sample_plane(b, tex, 1, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 0), @@ -412,12 +421,13 @@ lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex) } static void 
-lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex) +lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *y = sample_plane(b, tex, 0); - nir_ssa_def *uxvx = sample_plane(b, tex, 1); + nir_ssa_def *y = sample_plane(b, tex, 0, options); + nir_ssa_def *uxvx = sample_plane(b, tex, 1, options); convert_yuv_to_rgb(b, tex, nir_channel(b, y, 1), @@ -427,11 +437,12 @@ lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex) } static void -lower_ayuv_external(nir_builder *b, nir_tex_instr *tex) +lower_ayuv_external(nir_builder *b, nir_tex_instr *tex, + const nir_lower_tex_options *options) { b->cursor = nir_after_instr(&tex->instr); - nir_ssa_def *ayuv = sample_plane(b, tex, 0); + nir_ssa_def *ayuv = sample_plane(b, tex, 0, options); convert_yuv_to_rgb(b, tex, nir_channel(b, ayuv, 2), @@ -879,6 +890,25 @@ lower_tex_packing(nir_builder *b, nir_tex_instr *tex, color->parent_instr); } +static bool +sampler_index_lt(nir_tex_instr *tex, unsigned max) +{ + assert(nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref) == -1); + + unsigned sampler_index = tex->sampler_index; + + int sampler_offset_idx = + nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset); + if (sampler_offset_idx >= 0) { + if (!nir_src_is_const(tex->src[sampler_offset_idx].src)) + return false; + + sampler_index += nir_src_as_uint(tex->src[sampler_offset_idx].src); + } + + return sampler_index < max; +} + static bool nir_lower_tex_block(nir_block *block, nir_builder *b, const nir_lower_tex_options *options) @@ -923,27 +953,27 @@ nir_lower_tex_block(nir_block *block, nir_builder *b, } if ((1 << tex->texture_index) & options->lower_y_uv_external) { - lower_y_uv_external(b, tex); + lower_y_uv_external(b, tex, options); progress = true; } if ((1 << tex->texture_index) & options->lower_y_u_v_external) { - lower_y_u_v_external(b, tex); + lower_y_u_v_external(b, tex, options); progress = true; } if ((1 
<< tex->texture_index) & options->lower_yx_xuxv_external) { - lower_yx_xuxv_external(b, tex); + lower_yx_xuxv_external(b, tex, options); progress = true; } if ((1 << tex->texture_index) & options->lower_xy_uxvx_external) { - lower_xy_uxvx_external(b, tex); + lower_xy_uxvx_external(b, tex, options); progress = true; } if ((1 << tex->texture_index) & options->lower_ayuv_external) { - lower_ayuv_external(b, tex); + lower_ayuv_external(b, tex, options); progress = true; } @@ -995,6 +1025,8 @@ nir_lower_tex_block(nir_block *block, nir_builder *b, (options->lower_txd_shadow && tex->is_shadow) || (options->lower_txd_shadow_clamp && tex->is_shadow && has_min_lod) || (options->lower_txd_offset_clamp && has_offset && has_min_lod) || + (options->lower_txd_clamp_if_sampler_index_not_lt_16 && + has_min_lod && !sampler_index_lt(tex, 16)) || (options->lower_txd_cube_map && tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) || (options->lower_txd_3d && diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index d32005846a6..f52e623ef0f 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -404,12 +404,21 @@ def unpack_4x8(fmt): float absX = fabs(src0.x); float absY = fabs(src0.y); float absZ = fabs(src0.z); -if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; } -if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; } -if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; } -if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; } -if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; } -if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; } + +float ma = 0.0; +if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; } +if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; } +if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; } + +if (src0.x >= 0 && absX >= 
absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; } +if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; } +if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; } +if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; } +if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; } +if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; } + +dst.x = dst.x / ma + 0.5; +dst.y = dst.y / ma + 0.5; """) unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """ diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 75a3d2ad238..53c842b9ef9 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -618,11 +618,11 @@ # Reassociate constants in add/mul chains so they can be folded together. # For now, we mostly only handle cases where the constants are separated by # a single non-constant. We could do better eventually. - (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)), - (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)), - (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)), - (('~fadd', '#a', ('fneg', ('fadd', b, '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), - (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)), + (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), + (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), + (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), + (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), + (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), # By definition... 
(('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), @@ -929,9 +929,6 @@ def bitfield_reverse(u): (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'), (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), - (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)), - (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)), - # we do these late so that we don't get in the way of creating ffmas (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c b/src/compiler/nir/nir_opt_copy_prop_vars.c index 392fef407cf..a71cce19a80 100644 --- a/src/compiler/nir/nir_opt_copy_prop_vars.c +++ b/src/compiler/nir/nir_opt_copy_prop_vars.c @@ -653,7 +653,7 @@ copy_prop_vars_block(struct copy_prop_var_state *state, struct copy_entry *src_entry = lookup_entry_for_deref(copies, src, nir_derefs_a_contains_b_bit); - struct value value; + struct value value = {0}; if (try_load_from_entry(state, src_entry, b, intrin, src, &value)) { if (value.is_ssa) { /* lookup_load has already ensured that we get a single SSA diff --git a/src/compiler/nir/nir_opt_idiv_const.c b/src/compiler/nir/nir_opt_idiv_const.c index 7fa739161ba..3e4b7a42d42 100644 --- a/src/compiler/nir/nir_opt_idiv_const.c +++ b/src/compiler/nir/nir_opt_idiv_const.c @@ -65,15 +65,17 @@ build_umod(nir_builder *b, nir_ssa_def *n, uint64_t d) static nir_ssa_def * build_idiv(nir_builder *b, nir_ssa_def *n, int64_t d) { + uint64_t abs_d = d < 0 ? -d : d; + if (d == 0) { return nir_imm_intN_t(b, 0, n->bit_size); } else if (d == 1) { return n; } else if (d == -1) { return nir_ineg(b, n); - } else if (util_is_power_of_two_or_zero64(d)) { - uint64_t abs_d = d < 0 ? 
-d : d; - nir_ssa_def *uq = nir_ishr(b, n, nir_imm_int(b, util_logbase2_64(abs_d))); + } else if (util_is_power_of_two_or_zero64(abs_d)) { + nir_ssa_def *uq = nir_ushr(b, nir_iabs(b, n), + nir_imm_int(b, util_logbase2_64(abs_d))); nir_ssa_def *n_neg = nir_ilt(b, n, nir_imm_intN_t(b, 0, n->bit_size)); nir_ssa_def *neg = d < 0 ? nir_inot(b, n_neg) : n_neg; return nir_bcsel(b, neg, nir_ineg(b, uq), uq); diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c index c2f945d4d59..ba94807bb20 100644 --- a/src/compiler/nir/nir_opt_if.c +++ b/src/compiler/nir/nir_opt_if.c @@ -313,6 +313,13 @@ opt_if_loop_last_continue(nir_loop *loop) if (!then_ends_in_continue && !else_ends_in_continue) return false; + /* if the block after the if/else is empty we bail, otherwise we might end + * up looping forever + */ + if (&nif->cf_node == nir_cf_node_prev(&last_block->cf_node) && + exec_list_is_empty(&last_block->instr_list)) + return false; + /* Move the last block of the loop inside the last if-statement */ nir_cf_list tmp; nir_cf_extract(&tmp, nir_after_cf_node(if_node), diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index 1deb02a380e..32d337f99dd 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -59,8 +59,7 @@ static bool block_check_for_allowed_instrs(nir_block *block, unsigned *count, - bool alu_ok, bool indirect_load_ok, - bool expensive_alu_ok) + bool alu_ok, bool indirect_load_ok) { nir_foreach_instr(instr, block) { switch (instr->type) { @@ -118,25 +117,6 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, case nir_op_vec3: case nir_op_vec4: break; - - case nir_op_fcos: - case nir_op_fdiv: - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_fmod: - case nir_op_fpow: - case nir_op_frcp: - case nir_op_frem: - case nir_op_frsq: - case nir_op_fsin: - case nir_op_idiv: - case nir_op_irem: - case nir_op_udiv: - if (!alu_ok || 
!expensive_alu_ok) - return false; - - break; - default: if (!alu_ok) { /* It must be a move-like operation. */ @@ -180,8 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, static bool nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, - unsigned limit, bool indirect_load_ok, - bool expensive_alu_ok) + unsigned limit, bool indirect_load_ok) { if (nir_cf_node_is_first(&block->cf_node)) return false; @@ -202,9 +181,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, /* ... and those blocks must only contain "allowed" instructions. */ unsigned count = 0; if (!block_check_for_allowed_instrs(then_block, &count, limit != 0, - indirect_load_ok, expensive_alu_ok) || + indirect_load_ok) || !block_check_for_allowed_instrs(else_block, &count, limit != 0, - indirect_load_ok, expensive_alu_ok)) + indirect_load_ok)) return false; if (count > limit) @@ -271,15 +250,14 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, static bool nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok) + bool indirect_load_ok) { nir_shader *shader = impl->function->shader; bool progress = false; nir_foreach_block_safe(block, impl) { progress |= nir_opt_peephole_select_block(block, shader, limit, - indirect_load_ok, - expensive_alu_ok); + indirect_load_ok); } if (progress) { @@ -295,15 +273,14 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok) + bool indirect_load_ok) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) progress |= nir_opt_peephole_select_impl(function->impl, limit, - indirect_load_ok, - expensive_alu_ok); + indirect_load_ok); } return progress; diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 80bc25fde9a..422249677b7 100644 --- 
a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -812,8 +812,8 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) assert(dim < ARRAY_SIZE(dim_name) && dim_name[dim]); fprintf(fp, " image_dim=%s", dim_name[dim]); } else if (idx == NIR_INTRINSIC_IMAGE_ARRAY) { - bool array = nir_intrinsic_image_dim(instr); - fprintf(fp, " image_dim=%s", array ? "true" : "false"); + bool array = nir_intrinsic_image_array(instr); + fprintf(fp, " image_array=%s", array ? "true" : "false"); } else if (idx == NIR_INTRINSIC_DESC_TYPE) { VkDescriptorType desc_type = nir_intrinsic_desc_type(instr); fprintf(fp, " desc_type=%s", vulkan_descriptor_type_name(desc_type)); diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c index b4d22d91c27..f182818374d 100644 --- a/src/compiler/nir/nir_repair_ssa.c +++ b/src/compiler/nir/nir_repair_ssa.c @@ -77,6 +77,15 @@ repair_ssa_def(nir_ssa_def *def, void *void_state) } } + nir_foreach_if_use(src, def) { + nir_block *block_before_if = + nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node)); + if (!nir_block_dominates(def->parent_instr->block, block_before_if)) { + is_valid = false; + break; + } + } + if (is_valid) return true; @@ -98,6 +107,15 @@ repair_ssa_def(nir_ssa_def *def, void *void_state) } } + nir_foreach_if_use_safe(src, def) { + nir_block *block_before_if = + nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node)); + if (!nir_block_dominates(def->parent_instr->block, block_before_if)) { + nir_if_rewrite_condition(src->parent_if, nir_src_for_ssa( + nir_phi_builder_value_get_block_def(val, block_before_if))); + } + } + return true; } diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h index 89f1cba5c52..1fb450752ad 100644 --- a/src/compiler/nir/nir_search_helpers.h +++ b/src/compiler/nir/nir_search_helpers.h @@ -116,22 +116,6 @@ is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, return 
!nir_src_is_const(instr->src[src].src); } -static inline bool -is_used_more_than_once(nir_alu_instr *instr) -{ - bool zero_if_use = list_empty(&instr->dest.dest.ssa.if_uses); - bool zero_use = list_empty(&instr->dest.dest.ssa.uses); - - if (zero_use && zero_if_use) - return false; - else if (zero_use && list_is_singular(&instr->dest.dest.ssa.if_uses)) - return false; - else if (zero_if_use && list_is_singular(&instr->dest.dest.ssa.uses)) - return false; - - return true; -} - static inline bool is_used_once(nir_alu_instr *instr) { diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp index b4bde5470c0..3a406e99769 100644 --- a/src/compiler/nir_types.cpp +++ b/src/compiler/nir_types.cpp @@ -326,6 +326,12 @@ glsl_type_is_integer(const struct glsl_type *type) return type->is_integer(); } +bool +glsl_type_contains_64bit(const struct glsl_type *type) +{ + return type->contains_64bit(); +} + const glsl_type * glsl_void_type(void) { diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h index 40cddf76374..eb5cdf0a089 100644 --- a/src/compiler/nir_types.h +++ b/src/compiler/nir_types.h @@ -97,37 +97,7 @@ unsigned glsl_atomic_size(const struct glsl_type *type); static inline unsigned glsl_get_bit_size(const struct glsl_type *type) { - switch (glsl_get_base_type(type)) { - case GLSL_TYPE_BOOL: - return 1; - - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_FLOAT: /* TODO handle mediump */ - case GLSL_TYPE_SUBROUTINE: - return 32; - - case GLSL_TYPE_FLOAT16: - case GLSL_TYPE_UINT16: - case GLSL_TYPE_INT16: - return 16; - - case GLSL_TYPE_UINT8: - case GLSL_TYPE_INT8: - return 8; - - case GLSL_TYPE_DOUBLE: - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - case GLSL_TYPE_IMAGE: - case GLSL_TYPE_SAMPLER: - return 64; - - default: - unreachable("unknown base type"); - } - - return 0; + return glsl_base_type_get_bit_size(glsl_get_base_type(type)); } bool glsl_type_is_16bit(const struct glsl_type *type); @@ -149,6 +119,7 @@ bool 
glsl_type_is_dual_slot(const struct glsl_type *type); bool glsl_type_is_numeric(const struct glsl_type *type); bool glsl_type_is_boolean(const struct glsl_type *type); bool glsl_type_is_integer(const struct glsl_type *type); +bool glsl_type_contains_64bit(const struct glsl_type *type); bool glsl_sampler_type_is_shadow(const struct glsl_type *type); bool glsl_sampler_type_is_array(const struct glsl_type *type); bool glsl_contains_atomic(const struct glsl_type *type); diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h index c3dbe764961..e82f465b256 100644 --- a/src/compiler/shader_info.h +++ b/src/compiler/shader_info.h @@ -36,6 +36,8 @@ struct spirv_supported_capabilities { bool address; bool atomic_storage; bool descriptor_array_dynamic_indexing; + bool descriptor_array_non_uniform_indexing; + bool descriptor_indexing; bool device_group; bool draw_parameters; bool float64; diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 9bfe5805919..f76cac88f18 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -494,6 +494,7 @@ vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode, break; case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpMemberDecorate: case SpvOpDecorateStringGOOGLE: case SpvOpMemberDecorateStringGOOGLE: @@ -503,6 +504,7 @@ vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode, struct vtn_decoration *dec = rzalloc(b, struct vtn_decoration); switch (opcode) { case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpDecorateStringGOOGLE: dec->scope = VTN_DEC_DECORATION; break; @@ -2155,6 +2157,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, case nir_texop_txl: case nir_texop_txd: case nir_texop_tg4: + case nir_texop_lod: /* These operations require a sampler */ p->src = nir_src_for_ssa(&sampler->dest.ssa); p->src_type = nir_tex_src_sampler_deref; @@ -2163,7 +2166,6 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, case nir_texop_txf: case 
nir_texop_txf_ms: case nir_texop_txs: - case nir_texop_lod: case nir_texop_query_levels: case nir_texop_texture_samples: case nir_texop_samples_identical: @@ -3045,12 +3047,7 @@ nir_ssa_def * vtn_vector_extract_dynamic(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *index) { - nir_ssa_def *dest = vtn_vector_extract(b, src, 0); - for (unsigned i = 1; i < src->num_components; i++) - dest = nir_bcsel(&b->nb, nir_ieq_imm(&b->nb, index, i), - vtn_vector_extract(b, src, i), dest); - - return dest; + return nir_vector_extract(&b->nb, src, nir_i2i(&b->nb, index, 32)); } nir_ssa_def * @@ -3595,6 +3592,7 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityInt64Atomics: spv_check_supported(int64_atomics, cap); + break; case SpvCapabilityInt8: spv_check_supported(int8, cap); @@ -3703,12 +3701,26 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, spv_check_supported(storage_8bit, cap); break; + case SpvCapabilityShaderNonUniformEXT: + spv_check_supported(descriptor_indexing, cap); + break; + case SpvCapabilityInputAttachmentArrayDynamicIndexingEXT: case SpvCapabilityUniformTexelBufferArrayDynamicIndexingEXT: case SpvCapabilityStorageTexelBufferArrayDynamicIndexingEXT: spv_check_supported(descriptor_array_dynamic_indexing, cap); break; + case SpvCapabilityUniformBufferArrayNonUniformIndexingEXT: + case SpvCapabilitySampledImageArrayNonUniformIndexingEXT: + case SpvCapabilityStorageBufferArrayNonUniformIndexingEXT: + case SpvCapabilityStorageImageArrayNonUniformIndexingEXT: + case SpvCapabilityInputAttachmentArrayNonUniformIndexingEXT: + case SpvCapabilityUniformTexelBufferArrayNonUniformIndexingEXT: + case SpvCapabilityStorageTexelBufferArrayNonUniformIndexingEXT: + spv_check_supported(descriptor_array_non_uniform_indexing, cap); + break; + case SpvCapabilityRuntimeDescriptorArrayEXT: spv_check_supported(runtime_descriptor_array, cap); break; @@ -3764,6 +3776,7 @@ vtn_handle_preamble_instruction(struct vtn_builder 
*b, SpvOp opcode, case SpvOpExecutionMode: case SpvOpDecorationGroup: case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpMemberDecorate: case SpvOpGroupDecorate: case SpvOpGroupMemberDecorate: @@ -3951,6 +3964,7 @@ vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode, case SpvOpMemberName: case SpvOpDecorationGroup: case SpvOpDecorate: + case SpvOpDecorateId: case SpvOpMemberDecorate: case SpvOpGroupDecorate: case SpvOpGroupMemberDecorate: @@ -4480,20 +4494,35 @@ spirv_to_nir(const uint32_t *words, size_t word_count, } } while (progress); + vtn_assert(b->entry_point->value_type == vtn_value_type_function); + nir_function *entry_point = b->entry_point->func->impl->function; + vtn_assert(entry_point); + + entry_point->is_entrypoint = true; + + /* When multiple shader stages exist in the same SPIR-V module, we + * generate input and output variables for every stage, in the same + * NIR program. These dead variables can be invalid NIR. For example, + * TCS outputs must be per-vertex arrays (or decorated 'patch'), while + * VS output variables wouldn't be. + * + * To ensure we have valid NIR, we eliminate any dead inputs and outputs + * right away. In order to do so, we must lower any constant initializers + * on outputs so nir_remove_dead_variables sees that they're written to. + */ + nir_lower_constant_initializers(b->shader, nir_var_shader_out); + nir_remove_dead_variables(b->shader, + nir_var_shader_in | nir_var_shader_out); + /* We sometimes generate bogus derefs that, while never used, give the * validator a bit of heartburn. Run dead code to get rid of them. 
*/ nir_opt_dce(b->shader); - vtn_assert(b->entry_point->value_type == vtn_value_type_function); - nir_function *entry_point = b->entry_point->func->impl->function; - vtn_assert(entry_point); - /* Unparent the shader from the vtn_builder before we delete the builder */ ralloc_steal(NULL, b->shader); ralloc_free(b); - entry_point->is_entrypoint = true; return entry_point; } diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c index ecdfd0c735f..fe5340ab8cf 100644 --- a/src/compiler/spirv/vtn_variables.c +++ b/src/compiler/spirv/vtn_variables.c @@ -1444,6 +1444,8 @@ apply_var_decoration(struct vtn_builder *b, switch (builtin) { case SpvBuiltInTessLevelOuter: case SpvBuiltInTessLevelInner: + case SpvBuiltInClipDistance: + case SpvBuiltInCullDistance: var_data->compact = true; break; case SpvBuiltInFragCoord: @@ -2442,9 +2444,17 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, case SpvOpArrayLength: { struct vtn_pointer *ptr = vtn_value(b, w[3], vtn_value_type_pointer)->pointer; + const uint32_t field = w[4]; - const uint32_t offset = ptr->var->type->offsets[w[4]]; - const uint32_t stride = ptr->var->type->members[w[4]]->stride; + vtn_fail_if(ptr->type->base_type != vtn_base_type_struct, + "OpArrayLength must take a pointer to a structure type"); + vtn_fail_if(field != ptr->type->length - 1 || + ptr->type->members[field]->base_type != vtn_base_type_array, + "OpArrayLength must reference the last memeber of the " + "structure and that must be an array"); + + const uint32_t offset = ptr->type->offsets[field]; + const uint32_t stride = ptr->type->members[field]->stride; if (!ptr->block_index) { struct vtn_access_chain chain = { diff --git a/src/egl/Android.mk b/src/egl/Android.mk index 42b391e6d86..3c7f1366e34 100644 --- a/src/egl/Android.mk +++ b/src/egl/Android.mk @@ -59,11 +59,22 @@ LOCAL_SHARED_LIBRARIES := \ libcutils \ libsync +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_C_INCLUDES += \ + 
frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include +LOCAL_HEADER_LIBRARIES += libnativebase_headers +endif + ifeq ($(BOARD_USES_DRM_GRALLOC),true) LOCAL_CFLAGS += -DHAVE_DRM_GRALLOC LOCAL_SHARED_LIBRARIES += libgralloc_drm endif +ifeq ($(strip $(BOARD_USES_GRALLOC1)),true) +LOCAL_CFLAGS += -DHAVE_GRALLOC1 +endif + ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),) LOCAL_SHARED_LIBRARIES += libnativewindow endif @@ -81,6 +92,6 @@ endif LOCAL_MODULE := libGLES_mesa LOCAL_MODULE_RELATIVE_PATH := egl - +LOCAL_CFLAGS += -Wno-error include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c index c98b9a5d18a..ca26e34daa3 100644 --- a/src/egl/drivers/dri2/egl_dri2.c +++ b/src/egl/drivers/dri2/egl_dri2.c @@ -66,6 +66,20 @@ #include "util/u_vector.h" #include "mapi/glapi/glapi.h" +/* Additional definitions not yet in the drm_fourcc.h. + */ +#ifndef DRM_FORMAT_P010 +#define DRM_FORMAT_P010 fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cb:Cr plane 10 bits per channel */ +#endif + +#ifndef DRM_FORMAT_P012 +#define DRM_FORMAT_P012 fourcc_code('P', '0', '1', '2') /* 2x2 subsampled Cb:Cr plane 12 bits per channel */ +#endif + +#ifndef DRM_FORMAT_P016 +#define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cb:Cr plane 16 bits per channel */ +#endif + #define NUM_ATTRIBS 12 static void @@ -199,8 +213,10 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id, bind_to_texture_rgb = 0; bind_to_texture_rgba = 0; - for (int i = 0; dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib, - &value); ++i) { + for (int i = 0; i < __DRI_ATTRIB_MAX; ++i) { + if (!dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib, &value)) + break; + switch (attrib) { case __DRI_ATTRIB_RENDER_TYPE: if (value & __DRI_ATTRIB_RGBA_BIT) @@ -2262,6 +2278,9 @@ dri2_num_fourcc_format_planes(EGLint format) case DRM_FORMAT_NV21: case DRM_FORMAT_NV16: 
case DRM_FORMAT_NV61: + case DRM_FORMAT_P010: + case DRM_FORMAT_P012: + case DRM_FORMAT_P016: return 2; case DRM_FORMAT_YUV410: diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index a9ddadf11b1..4e80deb2038 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -69,6 +69,10 @@ struct zwp_linux_dmabuf_v1; #include #endif /* HAVE_ANDROID_PLATFORM */ +#ifdef HAVE_GRALLOC1 +#include +#endif + #include "eglconfig.h" #include "eglcontext.h" #include "egldevice.h" @@ -238,7 +242,14 @@ struct dri2_egl_display #endif #ifdef HAVE_ANDROID_PLATFORM - const gralloc_module_t *gralloc; + const hw_module_t *gralloc; + uint16_t gralloc_version; +#ifdef HAVE_GRALLOC1 + gralloc1_device_t *gralloc1_dvc; + GRALLOC1_PFN_LOCK_FLEX pfn_lockflex; + GRALLOC1_PFN_GET_FORMAT pfn_getFormat; + GRALLOC1_PFN_UNLOCK pfn_unlock; +#endif #endif bool is_render_node; diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 366a9ec14e9..a08723625fb 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -49,6 +49,8 @@ #define ALIGN(val, align) (((val) + (align) - 1) & ~((align) - 1)) +#define GRALLOC_DRM_GET_FORMAT 1 + struct droid_yuv_format { /* Lookup keys */ int native; /* HAL_PIXEL_FORMAT_ */ @@ -59,14 +61,26 @@ struct droid_yuv_format { int fourcc; /* __DRI_IMAGE_FOURCC_ */ }; +/* This enumeration can be deleted if Android defined it in + * system/core/include/system/graphics.h + */ +enum { + HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL = 0x100, + HAL_PIXEL_FORMAT_NV12 = 0x10F, + HAL_PIXEL_FORMAT_P010_INTEL = 0x110 +}; + /* The following table is used to look up a DRI image FourCC based * on native format and information contained in android_ycbcr struct. 
*/ static const struct droid_yuv_format droid_yuv_formats[] = { /* Native format, YCrCb, Chroma step, DRI image FourCC */ { HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, + { HAL_PIXEL_FORMAT_P010_INTEL, 0, 4, __DRI_IMAGE_FOURCC_P010 }, { HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 1, __DRI_IMAGE_FOURCC_YUV420 }, { HAL_PIXEL_FORMAT_YCbCr_420_888, 1, 1, __DRI_IMAGE_FOURCC_YVU420 }, { HAL_PIXEL_FORMAT_YV12, 1, 1, __DRI_IMAGE_FOURCC_YVU420 }, + { HAL_PIXEL_FORMAT_NV12, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, + { HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, /* HACK: See droid_create_image_from_prime_fd() and * https://issuetracker.google.com/32077885. */ { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, @@ -249,6 +263,51 @@ droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf) return EGL_TRUE; } +static int +droid_resolve_format(struct dri2_egl_display *dri2_dpy, + struct ANativeWindowBuffer *buf) +{ + int format = -1; + int ret; + + if (buf->format != HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) + return buf->format; +#ifdef HAVE_GRALLOC1 + if(dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + + if (!dri2_dpy->pfn_getFormat) { + _eglLog(_EGL_WARNING, "Gralloc does not support getFormat"); + return -1; + } + ret = dri2_dpy->pfn_getFormat(dri2_dpy->gralloc1_dvc, buf->handle, + &format); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->getFormat failed: %d", ret); + return -1; + } + } else { +#else + const gralloc_module_t *gralloc0; + gralloc0 = dri2_dpy->gralloc; + + if (!gralloc0->perform) { + _eglLog(_EGL_WARNING, "gralloc->perform not supported"); + return -1; + } + ret = gralloc0->perform(dri2_dpy->gralloc, + GRALLOC_DRM_GET_FORMAT, + buf->handle, &format); + if (ret){ + _eglLog(_EGL_WARNING, "gralloc->perform failed with error: %d", ret); + return -1; + } +#endif +#ifdef HAVE_GRALLOC1 + } +#endif + return format; +} + static EGLBoolean droid_window_enqueue_buffer(_EGLDisplay *disp, struct 
dri2_egl_surface *dri2_surf) { @@ -463,7 +522,7 @@ droid_swap_interval(_EGLDriver *drv, _EGLDisplay *dpy, struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf); struct ANativeWindow *window = dri2_surf->window; - if (window->setSwapInterval(window, interval)) + if (window && window->setSwapInterval(window, interval)) return EGL_FALSE; surf->SwapInterval = interval; @@ -664,11 +723,18 @@ droid_query_buffer_age(_EGLDriver *drv, { struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surface); + /* To avoid blocking other EGL calls, release the display mutex before + * we enter droid_window_dequeue_buffer() and re-acquire the mutex upon + * return. + */ + mtx_unlock(&disp->Mutex); if (update_buffers(dri2_surf) < 0) { _eglError(EGL_BAD_ALLOC, "droid_query_buffer_age"); + mtx_lock(&disp->Mutex); return -1; } + mtx_lock(&disp->Mutex); return dri2_surf->back ? dri2_surf->back->age : 0; } @@ -731,6 +797,31 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw) return EGL_TRUE; } +static int get_ycbcr_from_flexlayout(struct android_flex_layout *outFlexLayout, struct android_ycbcr *ycbcr) +{ + + for( int i = 0; i < outFlexLayout->num_planes; i++) { + switch(outFlexLayout->planes[i].component){ + case FLEX_COMPONENT_Y: + ycbcr->y = outFlexLayout->planes[i].top_left; + ycbcr->ystride = outFlexLayout->planes[i].v_increment; + break; + case FLEX_COMPONENT_Cb: + ycbcr->cb = outFlexLayout->planes[i].top_left; + ycbcr->cstride = outFlexLayout->planes[i].v_increment; + break; + case FLEX_COMPONENT_Cr: + ycbcr->cr = outFlexLayout->planes[i].top_left; + ycbcr->chroma_step = outFlexLayout->planes[i].h_increment; + break; + default: + _eglLog(_EGL_WARNING,"unknown component 0x%x", __func__, outFlexLayout->planes[i].component); + break; + } + } + return 0; +} + #if ANDROID_API_LEVEL >= 23 static EGLBoolean droid_set_damage_region(_EGLDriver *drv, @@ -774,30 +865,70 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx, { struct 
dri2_egl_display *dri2_dpy = dri2_egl_display(disp); struct android_ycbcr ycbcr; +#ifdef HAVE_GRALLOC1 + struct android_flex_layout outFlexLayout; + gralloc1_rect_t accessRegion; +#endif size_t offsets[3]; size_t pitches[3]; int is_ycrcb; int fourcc; int ret; - if (!dri2_dpy->gralloc->lock_ycbcr) { - _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr"); + int format = droid_resolve_format(dri2_dpy, buf); + if (format < 0) { + _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; } memset(&ycbcr, 0, sizeof(ycbcr)); - ret = dri2_dpy->gralloc->lock_ycbcr(dri2_dpy->gralloc, buf->handle, - 0, 0, 0, 0, 0, &ycbcr); - if (ret) { - /* HACK: See droid_create_image_from_prime_fd() and - * https://issuetracker.google.com/32077885.*/ - if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) - return NULL; - - _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret); - return NULL; - } - dri2_dpy->gralloc->unlock(dri2_dpy->gralloc, buf->handle); +#ifdef HAVE_GRALLOC1 + if(dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + if (!dri2_dpy->pfn_lockflex) { + _eglLog(_EGL_WARNING, "Gralloc does not support lockflex"); + return NULL; + } + + ret = dri2_dpy->pfn_lockflex(dri2_dpy->gralloc1_dvc, buf->handle, + 0, 0, &accessRegion, &outFlexLayout, -1); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret); + return NULL; + } + ret = get_ycbcr_from_flexlayout(&outFlexLayout, &ycbcr); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret); + return NULL; + } + int outReleaseFence = 0; + dri2_dpy->pfn_unlock(dri2_dpy->gralloc1_dvc, buf->handle, &outReleaseFence); + } else { +#endif + const gralloc_module_t *gralloc0; + gralloc0 = dri2_dpy->gralloc; + + if (!gralloc0->lock_ycbcr) { + _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr"); + return NULL; + } + + ret = gralloc0->lock_ycbcr(gralloc0, buf->handle, + 0, 0, 0, 0, 0, &ycbcr); + + if (ret) { + /* HACK: See droid_create_image_from_prime_fd() 
and + * https://issuetracker.google.com/32077885.*/ + if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) + return NULL; + + _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret); + return NULL; + } + + gralloc0->unlock(dri2_dpy->gralloc, buf->handle); +#ifdef HAVE_GRALLOC1 + } +#endif /* When lock_ycbcr's usage argument contains no SW_READ/WRITE flags * it will return the .y/.cb/.cr pointers based on a NULL pointer, @@ -822,14 +953,15 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx, /* .chroma_step is the byte distance between the same chroma channel * values of subsequent pixels, assumed to be the same for Cb and Cr. */ - fourcc = get_fourcc_yuv(buf->format, is_ycrcb, ycbcr.chroma_step); + fourcc = get_fourcc_yuv(format, is_ycrcb, ycbcr.chroma_step); if (fourcc == -1) { _eglLog(_EGL_WARNING, "unsupported YUV format, native = %x, is_ycrcb = %d, chroma_step = %d", - buf->format, is_ycrcb, ycbcr.chroma_step); + format, is_ycrcb, ycbcr.chroma_step); return NULL; } - if (ycbcr.chroma_step == 2) { + /* FIXME? we should not rely on chroma_step */ + if (ycbcr.chroma_step == 2 || ycbcr.chroma_step == 4) { /* Semi-planar Y + CbCr or Y + CrCb format. 
*/ const EGLint attr_list_2plane[] = { EGL_WIDTH, buf->width, @@ -871,9 +1003,16 @@ static _EGLImage * droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, struct ANativeWindowBuffer *buf, int fd) { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); unsigned int pitch; - if (is_yuv(buf->format)) { + int format = droid_resolve_format(dri2_dpy, buf); + if (format < 0) { + _eglLog(_EGL_WARNING, "Could not resolve buffer format"); + return NULL; + } + + if (is_yuv(format)) { _EGLImage *image; image = droid_create_image_from_prime_fd_yuv(disp, ctx, buf, fd); @@ -888,13 +1027,13 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, return image; } - const int fourcc = get_fourcc(buf->format); + const int fourcc = get_fourcc(format); if (fourcc == -1) { _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; } - pitch = buf->stride * get_format_bpp(buf->format); + pitch = buf->stride * get_format_bpp(format); if (pitch == 0) { _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; @@ -1530,6 +1669,7 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp) _EGLDevice *dev; struct dri2_egl_display *dri2_dpy; const char *err; + hw_device_t *device; int ret; /* Not supported yet */ @@ -1547,6 +1687,27 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp) err = "DRI2: failed to get gralloc module"; goto cleanup; } + dri2_dpy->gralloc_version = dri2_dpy->gralloc->module_api_version; +#ifdef HAVE_GRALLOC1 + if (dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + ret = dri2_dpy->gralloc->methods->open(dri2_dpy->gralloc, GRALLOC_HARDWARE_MODULE_ID, &device); + if (ret) { + err = "Failed to open hw_device device"; + goto cleanup; + } else { + dri2_dpy->gralloc1_dvc = (gralloc1_device_t *)device; + + dri2_dpy->pfn_lockflex = (GRALLOC1_PFN_LOCK_FLEX)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_LOCK_FLEX); + + dri2_dpy->pfn_getFormat = 
(GRALLOC1_PFN_GET_FORMAT)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_GET_FORMAT); + + dri2_dpy->pfn_unlock = (GRALLOC1_PFN_UNLOCK)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_UNLOCK); + } + } +#endif disp->DriverData = (void *) dri2_dpy; diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index c3ca1b6f7bc..3025e34ba63 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -637,10 +637,8 @@ update_buffers(struct dri2_egl_surface *dri2_surf) struct dri2_egl_display *dri2_dpy = dri2_egl_display(dri2_surf->base.Resource.Display); - if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width || - dri2_surf->base.Height != dri2_surf->wl_win->attached_height) { - - dri2_wl_release_buffers(dri2_surf); + if (dri2_surf->base.Width != dri2_surf->wl_win->width || + dri2_surf->base.Height != dri2_surf->wl_win->height) { dri2_surf->base.Width = dri2_surf->wl_win->width; dri2_surf->base.Height = dri2_surf->wl_win->height; @@ -648,6 +646,11 @@ update_buffers(struct dri2_egl_surface *dri2_surf) dri2_surf->dy = dri2_surf->wl_win->dy; } + if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width || + dri2_surf->base.Height != dri2_surf->wl_win->attached_height) { + dri2_wl_release_buffers(dri2_surf); + } + if (get_back_bo(dri2_surf) < 0) { _eglError(EGL_BAD_ALLOC, "failed to allocate color buffer"); return -1; diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c index 6b241a524ec..318a85a4f88 100644 --- a/src/egl/main/eglcontext.c +++ b/src/egl/main/eglcontext.c @@ -178,9 +178,12 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy, * is supported for OpenGL contexts, and requesting a * forward-compatible context for OpenGL versions less than 3.0 * will generate an error." 
+ * + * Note: since the forward-compatible flag can be set more than one way, + * the OpenGL version check is performed once, below. */ if ((val & EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR) && - (api != EGL_OPENGL_API || ctx->ClientMajorVersion < 3)) { + api != EGL_OPENGL_API) { err = EGL_BAD_ATTRIBUTE; break; } diff --git a/src/egl/main/eglcurrent.c b/src/egl/main/eglcurrent.c index 479f231fb8f..d20ec64e654 100644 --- a/src/egl/main/eglcurrent.c +++ b/src/egl/main/eglcurrent.c @@ -137,13 +137,37 @@ _eglDestroyThreadInfo(_EGLThreadInfo *t) } +/** + * Delete/free a _EGLThreadInfo object. + */ +static void +_eglDestroyThreadInfoCallback(_EGLThreadInfo *t) +{ + /* If this callback is called on thread termination then try to also give a + * chance to cleanup to the client drivers. If called for module termination + * then just release the thread information as calling eglReleaseThread + * would result in a deadlock. + */ + if (_egl_TSDInitialized) { + /* The callback handler has replaced the TLS entry, which is passed in as + * 't', with NULL. Restore it here so that the release thread finds it in + * the TLS entry. + */ + _eglSetTSD(t); + eglReleaseThread(); + } else { + _eglDestroyThreadInfo(t); + } +} + + /** * Make sure TSD is initialized and return current value. */ static inline _EGLThreadInfo * _eglCheckedGetTSD(void) { - if (_eglInitTSD(&_eglDestroyThreadInfo) != EGL_TRUE) { + if (_eglInitTSD(&_eglDestroyThreadInfoCallback) != EGL_TRUE) { _eglLog(_EGL_FATAL, "failed to initialize \"current\" system"); return NULL; } diff --git a/src/egl/main/egldevice.c b/src/egl/main/egldevice.c index 4878039be0e..c5c9a21273a 100644 --- a/src/egl/main/egldevice.c +++ b/src/egl/main/egldevice.c @@ -202,18 +202,6 @@ _eglDeviceSupports(_EGLDevice *dev, _EGLDeviceExtension ext) }; } -/* Ideally we'll have an extension which passes the render node, - * instead of the card one + magic. - * - * Then we can move this in _eglQueryDeviceStringEXT below. 
Until then - * keep it separate. - */ -const char * -_eglGetDRMDeviceRenderNode(_EGLDevice *dev) -{ - return dev->device->nodes[DRM_NODE_RENDER]; -} - EGLBoolean _eglQueryDeviceAttribEXT(_EGLDevice *dev, EGLint attribute, EGLAttrib *value) diff --git a/src/egl/main/egldevice.h b/src/egl/main/egldevice.h index 83a47d5eacc..883f96f8e30 100644 --- a/src/egl/main/egldevice.h +++ b/src/egl/main/egldevice.h @@ -68,9 +68,6 @@ typedef enum _egl_device_extension _EGLDeviceExtension; EGLBoolean _eglDeviceSupports(_EGLDevice *dev, _EGLDeviceExtension ext); -const char * -_eglGetDRMDeviceRenderNode(_EGLDevice *dev); - EGLBoolean _eglQueryDeviceAttribEXT(_EGLDevice *dev, EGLint attribute, EGLAttrib *value); diff --git a/src/egl/meson.build b/src/egl/meson.build index a23cc36fc2b..b7ff09e9fed 100644 --- a/src/egl/meson.build +++ b/src/egl/meson.build @@ -93,10 +93,11 @@ if with_dri2 'drivers/dri2/egl_dri2.h', 'drivers/dri2/egl_dri2_fallbacks.h', ) + link_for_egl += [libloader, libxmlconfig] + incs_for_egl += inc_loader if with_platform_x11 files_egl += files('drivers/dri2/platform_x11.c') - incs_for_egl += inc_loader if with_dri3 files_egl += files('drivers/dri2/platform_x11_dri3.c') link_for_egl += libloader_dri3_helper @@ -105,13 +106,12 @@ if with_dri2 endif if with_platform_drm files_egl += files('drivers/dri2/platform_drm.c') - link_for_egl += [libloader, libgbm, libxmlconfig] - incs_for_egl += [inc_loader, inc_gbm, include_directories('../gbm/main')] + link_for_egl += libgbm + incs_for_egl += [inc_gbm, include_directories('../gbm/main')] deps_for_egl += dep_libdrm endif if with_platform_surfaceless files_egl += files('drivers/dri2/platform_surfaceless.c') - incs_for_egl += [inc_loader] endif if with_platform_wayland deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers] @@ -127,7 +127,6 @@ if with_dri2 if with_platform_android deps_for_egl += dep_android files_egl += files('drivers/dri2/platform_android.c') - incs_for_egl += [inc_loader] endif 
elif with_platform_haiku incs_for_egl += inc_haikugl @@ -166,7 +165,7 @@ libegl = shared_library( '-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_@0@'.format(egl_native_platform.to_upper()), ], include_directories : incs_for_egl, - link_with : [link_for_egl, libloader, libxmlconfig, libglapi, libmesa_util], + link_with : [link_for_egl, libglapi, libmesa_util], link_args : [ld_args_bsymbolic, ld_args_gc_sections], dependencies : [deps_for_egl, dep_dl, dep_libdrm, dep_clock, dep_thread], install : true, diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am index 460fb87fb46..342f03d644c 100644 --- a/src/freedreno/Makefile.am +++ b/src/freedreno/Makefile.am @@ -45,6 +45,7 @@ TESTS = BUILT_SOURCES = CLEANFILES = EXTRA_DIST = \ + meson.build \ drm/meson.build \ ir3/ir3_nir_trig.py \ ir3/meson.build diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index d9fcf798b3d..68926c9553b 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_gcm, true); else if (gcm == 2) progress |= OPT(s, nir_opt_gcm, false); - progress |= OPT(s, nir_opt_peephole_select, 16, true, true); + progress |= OPT(s, nir_opt_peephole_select, 16, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/auxiliary/Android.mk b/src/gallium/auxiliary/Android.mk index 7618c6fcd93..fe976501451 100644 --- a/src/gallium/auxiliary/Android.mk +++ b/src/gallium/auxiliary/Android.mk @@ -32,8 +32,11 @@ LOCAL_SRC_FILES := \ $(C_SOURCES) \ $(NIR_SOURCES) \ $(RENDERONLY_SOURCES) \ - $(VL_STUB_SOURCES) \ - util/u_debug_stack_android.cpp + $(VL_STUB_SOURCES) + +ifeq ($(USE_LIBBACKTRACE),true) + LOCAL_SRC_FILES += util/u_debug_stack_android.cpp +endif LOCAL_C_INCLUDES := \ $(GALLIUM_TOP)/auxiliary/util \ diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c 
index 3fc096789c0..f8c69585e6a 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -950,6 +950,8 @@ draw_set_mapped_so_targets(struct draw_context *draw, { int i; + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + for (i = 0; i < num_targets; i++) draw->so.targets[i] = targets[i]; for (i = num_targets; i < PIPE_MAX_SO_BUFFERS; i++) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index fcbdd5050fe..f307c26d4f7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -556,11 +556,11 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, llvm::SmallVector MAttrs; -#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) -#if HAVE_LLVM >= 0x0400 - /* llvm-3.7+ implements sys::getHostCPUFeatures for x86, - * which allows us to enable/disable code generation based - * on the results of cpuid. +#if HAVE_LLVM >= 0x0400 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM)) + /* llvm-3.3+ implements sys::getHostCPUFeatures for Arm + * and llvm-3.7+ for x86, which allows us to enable/disable + * code generation based on the results of cpuid on these + * architectures. */ llvm::StringMap features; llvm::sys::getHostCPUFeatures(features); @@ -570,7 +570,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, ++f) { MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str()); } -#else +#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) /* * We need to unset attributes because sometimes LLVM mistakenly assumes * certain features are present given the processor name. 
@@ -625,6 +625,12 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, MAttrs.push_back("-avx512vl"); #endif #endif +#if defined(PIPE_ARCH_ARM) + if (!util_cpu_caps.has_neon) { + MAttrs.push_back("-neon"); + MAttrs.push_back("-crypto"); + MAttrs.push_back("-vfp2"); + } #endif #if defined(PIPE_ARCH_PPC) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 0f5b3d9acb7..d6af1d84471 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -1108,7 +1108,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld, * larger than the declared size but smaller than the buffer size. */ if (reg_file != TGSI_FILE_CONSTANT) { - assert(index_limit > 0); + assert(index_limit >= 0); max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, index_limit); diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index 64f2598a259..09eac4da95a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -220,7 +220,9 @@ tgsi_is_bindless_image_file(unsigned file) { return file != TGSI_FILE_IMAGE && file != TGSI_FILE_MEMORY && - file != TGSI_FILE_BUFFER; + file != TGSI_FILE_BUFFER && + file != TGSI_FILE_CONSTBUF && + file != TGSI_FILE_HW_ATOMIC; } #ifdef __cplusplus diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index 8e3bceae18d..b596c322918 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -1524,7 +1524,8 @@ tc_buffer_do_flush_region(struct threaded_context *tc, if (ttrans->staging) { struct pipe_box src_box; - u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment, + u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment + + (box->x - ttrans->b.box.x), box->width, &src_box); /* Copy the staging 
buffer into the original one. */ diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index eaf492ce8b0..b927d014179 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -487,6 +487,10 @@ The integer capabilities: * ``PIPE_CAP_DEST_SURFACE_SRGB_CONTROL``: Indicates whether the drivers supports switching the format between sRGB and linear for a surface that is used as destination in draw and blit calls. +* ``PIPE_CAP_MAX_VARYINGS``: The maximum number of fragment shader + varyings. This will generally correspond to + ``PIPE_SHADER_CAP_MAX_INPUTS`` for the fragment shader, but in some + cases may be a smaller number. .. _pipe_capf: diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c index fd320232528..35dcac1409b 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -360,6 +360,9 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 0; + case PIPE_CAP_MAX_VARYINGS: + return screen->specs.max_varyings; + case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index fe409fa5f52..dbc15f40389 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ b/src/gallium/drivers/freedreno/Makefile.am @@ -23,4 +23,6 @@ libfreedreno_la_SOURCES = \ $(a6xx_SOURCES) \ $(ir3_SOURCES) -EXTRA_DIST = meson.build +EXTRA_DIST = \ + ir3/ir3_cmdline.c \ + meson.build diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 772127c7478..498c1eae1d7 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -339,7 +339,6 @@ clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring, OUT_PKT3(ring, 
CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); OUT_RINGP(ring, patch_type, &batch->gmem_patches); - OUT_RING(ring, 0); OUT_PKT3(ring, CP_SET_CONSTANT, 4); OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index 1c073e31739..692188ebd4e 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -59,6 +59,28 @@ static uint32_t fmt2swap(enum pipe_format format) } } +static bool +use_hw_binning(struct fd_batch *batch) +{ + struct fd_gmem_stateobj *gmem = &batch->ctx->gmem; + + /* we hardcoded a limit of 8 "pipes", we can increase this limit + * at the cost of a slightly larger command stream + * however very few cases will need more than 8 + * gmem->num_vsc_pipes == 0 means empty batch (TODO: does it still happen?) + */ + if (gmem->num_vsc_pipes > 8 || !gmem->num_vsc_pipes) + return false; + + /* only a20x hw binning is implement + * a22x is more like a3xx, but perhaps the a20x works? (TODO) + */ + if (!is_a20x(batch->ctx->screen)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + /* transfer from gmem to system memory (ie. 
normal RAM) */ static void @@ -272,7 +294,7 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); y0 = ((float)tile->yoff) / ((float)pfb->height); y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); - OUT_PKT3(ring, CP_MEM_WRITE, 9); + OUT_PKT3(ring, CP_MEM_WRITE, 7); OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 36, 0, 0); OUT_RING(ring, fui(x0)); OUT_RING(ring, fui(y0)); @@ -280,8 +302,6 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, fui(y0)); OUT_RING(ring, fui(x0)); OUT_RING(ring, fui(y1)); - OUT_RING(ring, fui(x1)); - OUT_RING(ring, fui(y1)); OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); @@ -492,18 +512,18 @@ fd2_emit_tile_init(struct fd_batch *batch) /* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */ switch (patch->val) { case GMEM_PATCH_FASTCLEAR_COLOR: - size = align(gmem->bin_w * gmem->bin_h * color_size, 0x4000); + size = align(gmem->bin_w * gmem->bin_h * color_size, 0x8000); lines = size / 1024; depth_base = size / 2; break; case GMEM_PATCH_FASTCLEAR_DEPTH: - size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x4000); + size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x8000); lines = size / 1024; color_base = depth_base; depth_base = depth_base + size / 2; break; case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH: - lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x4000) / 1024; + lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x8000) / 1024; break; case GMEM_PATCH_RESTORE_INFO: patch->cs[0] = gmem->bin_w; @@ -535,7 +555,7 @@ fd2_emit_tile_init(struct fd_batch *batch) OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); OUT_RING(ring, 0); - if (is_a20x(ctx->screen) && fd_binning_enabled && gmem->num_vsc_pipes) { + if (use_hw_binning(batch)) { /* patch out unneeded memory exports by changing EXEC CF to EXEC_END * * in the shader compiler, we 
guarantee that the shader ends with @@ -694,7 +714,7 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, fui(0.0f)); } - if (is_a20x(ctx->screen) && fd_binning_enabled) { + if (use_hw_binning(batch)) { struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; OUT_PKT3(ring, CP_SET_CONSTANT, 2); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c index 1bd1f103ccd..2c813804689 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c @@ -55,6 +55,12 @@ fd2_setup_slices(struct fd_resource *rsc) break; } + /* mipmaps have power of two sizes in memory */ + if (level) { + width = util_next_power_of_two(width); + height = util_next_power_of_two(height); + } + slice->pitch = width; slice->offset = size; diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 5d92f86befc..b206911270a 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -74,7 +74,7 @@ ir2_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_dce); progress |= OPT(s, nir_opt_cse); /* progress |= OPT(s, nir_opt_gcm, true); */ - progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); + progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c index 460255f748a..c8719636182 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c @@ -438,7 +438,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) | A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) | 
A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap)); - OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RELOCW(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch)); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index ce8e4480be1..1879d2c60ed 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -853,7 +853,13 @@ fd_resource_create_with_modifiers(struct pipe_screen *pscreen, enum pipe_format format = tmpl->format; uint32_t size; - if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) { + /* when using kmsro, scanout buffers are allocated on the display device + * create_with_modifiers() doesn't give us usage flags, so we have to + * assume that all calls with modifiers are scanout-possible + */ + if (screen->ro && + ((tmpl->bind & PIPE_BIND_SCANOUT) || + !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { struct pipe_resource scanout_templat = *tmpl; struct renderonly_scanout *scanout; struct winsys_handle handle; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index e596a4e8462..c3b08ab0e0f 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -317,6 +317,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VIEWPORTS: return 1; + case PIPE_CAP_MAX_VARYINGS: + return 16; + case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: /* manage the variants for these ourself, to avoid breaking precompile: */ diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index a7b4a43c015..78707c66e62 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -402,6 
+402,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) return 0; case PIPE_CAP_ENDIANNESS: return PIPE_ENDIAN_LITTLE; + case PIPE_CAP_MAX_VARYINGS: + return 10; case PIPE_CAP_VENDOR_ID: return 0x8086; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index c95016a6cbe..b55b4a3c4fe 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -310,6 +310,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_CLEAR_TEXTURE: return 1; + case PIPE_CAP_MAX_VARYINGS: + return 32; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm index cd65b547279..576da1bab60 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm @@ -543,6 +543,8 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 long mov b32 $r3 0x3f800000 long nop +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long nop long ret @@ -554,7 +556,144 @@ long ret // SIZE: 9 * 8 bytes // gk104_rcp_f64: - long nop + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + ext u32 $r2 $r1 0xb14 + add b32 $r3 $r2 0xffffffff + joinat #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). 
Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + set $p0 0x1 gt u32 $r3 0x7fd + // $r3: 0 for norms, 0x36 for denorms, -1 for others + long mov b32 $r3 0x0 + sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 + join (not $p0) nop + // Process all special values: NaN, inf, denorm, 0 + mov b32 $r3 0xffffffff + // A number is NaN if its abs value is greater than or unordered with inf + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + (not $p0) bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. This + // behavior is both seen on the CPU and the blob + join or b32 $r1 $r1 0x80000 +rcp_inf_or_denorm_or_zero: + and b32 $r4 $r1 0x7ff00000 + // Other values with nonzero in exponent field should be inf + set $p0 0x1 eq s32 $r4 0x0 + sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + xor b32 $r1 $r1 0x7ff00000 + join mov b32 $r0 0x0 +rcp_denorm_or_zero: + set $p0 0x1 gtu f64 abs $r0d 0x0 + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + join or b32 $r1 $r1 0x7ff00000 +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + mul rn f64 $r0d $r0d 0x4350000000000000 + sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 + join mov b32 $r3 0x36 +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + set $p0 0x1 lt s32 $r3 0x0 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ ext u32 $r2 $r1 0xb14 + and b32 $r7 $r1 0x800fffff + add b32 $r7 $r7 0x3ff00000 + long mov b32 $r6 $r0 + sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + cvt rz f32 $r5 f64 $r6d + long rcp f32 $r4 $r5 + mov b32 $r0 0xbf800000 + fma rn f32 $r5 $r4 $r5 $r0 + fma rn f32 $r0 neg $r4 $r5 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + cvt f64 $r0d f32 $r0 + cvt f64 $r6d neg f64 $r6d + sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 + cvt f64 $r8d f32 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + subr b32 $r2 $r2 0x3ff + long add b32 $r4 $r2 $r3 + ext u32 $r3 $r1 0xb14 + // New exponent in $r3 + long add b32 $r3 $r3 $r4 + add b32 $r2 $r3 0xffffffff + sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + set $p0 0x1 lt u32 $r2 0x7fe + (not $p0) bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl b32 $r4 $r4 clamp 0x14 + long add b32 $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + set $p0 0x1 ge s32 $r3 0x7ff + (not $p0) bra #rcp_result_denorm + sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f + // Infinity + and b32 $r1 $r1 0x80000000 + long mov b32 $r0 0x0 + add b32 $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + set $p0 0x1 ne u32 $r3 0x0 + and b32 $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 cvt f64 $r6d f32 0x3e800000 + sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 + // 0x3f000000: 1/2 + (not $p0) cvt f64 $r6d f32 0x3f000000 + add b32 $r1 $r1 0x00100000 + mul rn f64 $r0d $r0d $r6d +rcp_end: long ret // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) @@ -565,7 +704,67 @@ gk104_rcp_f64: // SIZE: 14 * 8 bytes // gk104_rsq_f64: - long nop + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + $p0 or b32 $r1 $r1 0x00080000 + and b32 $r2 $r1 0x7fffffff + sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + ext u32 $r3 $r1 0xb14 + set $p1 0x1 le u32 $r3 0x2 + long or b32 $r2 $r0 $r2 + $p1 mul rn f64 $r0d $r0d 0x4350000000000000 + rsqrt64h $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + set b32 $r6 ne u32 $r3 0x7ff + long and b32 $r2 $r2 $r6 + sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 + set $p0 0x1 ne u32 $r2 0x0 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + and b32 $r1 $r1 0x80000000 + long mov b32 $r0 0x0 + long or b32 $r1 $r1 $r5 + long ret +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + long mov b32 $r4 0x0 + sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 + // 0x3f000000: 1/2 + cvt f64 $r8d f32 0x3f000000 + mul rn f64 $r2d $r0d $r8d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d 
$r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 + // Multiply 2^27 to result for small inputs to recover + $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 + long mov b32 $r1 $r5 + long mov b32 $r0 $r4 long ret // diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h index 37998768efe..ed948dee471 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h @@ -481,12 +481,132 @@ uint64_t gk104_builtin_code[] = { 0xd40040000840c785, 0x18fe00000000dde2, 0x4000000000001de4, - 0x9000000000001de7, -/* 0x0f08: gk104_rcp_f64 */ + 0x2000000000000007, 0x4000000000001de4, 0x9000000000001de7, -/* 0x0f18: gk104_rsq_f64 */ - 0x4000000000001de4, +/* 0x0f18: gk104_rcp_f64 */ + 0x7000c02c50109c03, + 0x0bfffffffc20dc02, + 0x6000000280000007, + 0x1a0ec01ff431dc03, + 0x180000000000dde2, + 0x228282f2b2d042f7, + 0x40000000000021f4, + 0x1bfffffffc00dde2, + 0x1e0edffc0001dc81, + 0x40000000200021e7, + 0x3800200000105c52, +/* 0x0f70: rcp_inf_or_denorm_or_zero */ + 0x39ffc00000111c02, + 0x190e0000fc41dc23, + 0x2202f2b2d2f042b7, + 0x40000000400001e7, + 0x39ffc00000105c82, + 0x1800000000001df2, +/* 0x0fa0: rcp_denorm_or_zero */ + 0x1e0ec0000001dc81, + 0x40000000200001e7, + 0x39ffc00000105c52, +/* 0x0fb8: rcp_denorm */ + 0x5000d0d400001c01, + 0x2280428282b282f7, + 0x18000000d800ddf2, +/* 0x0fd0: rcp_rejoin */ + 0x188e0000fc31dc23, + 0x40000006000001e7, + 0x7000c02c50109c03, + 0x3a003ffffc11dc02, + 0x08ffc0000071dc02, + 0x2800000000019de4, + 0x22e2b2a2828042b7, + 0x1006000019a15c04, + 0xc800000010511c00, + 0x1afe000000001de2, + 0x3000000014415c00, + 0x3008000014401e00, + 0x1000000001301c04, + 0x1000000019b19d04, + 0x22929292929292e7, + 0x1000cfe001321c04, + 0x2010000000611c01, + 0x2000000010001c01, + 0x2010000000611c01, + 0x2000000010001c01, + 0x2010000000611c01, + 0x2000000010001c01, + 
0x2282828282820297, + 0x2010000000611c01, + 0x2000000010001c01, + 0x0800000ffc209e02, + 0x480000000c211c03, + 0x7000c02c5010dc03, + 0x480000001030dc03, + 0x0bfffffffc309c02, + 0x22b28282b282b287, + 0x188ec01ff821dc03, + 0x40000000600021e7, + 0x6000c00050411c03, + 0x4800000004405c03, + 0x40000001c0001de7, +/* 0x10f0: rcp_result_inf_or_denorm */ + 0x1b0ec01ffc31dc23, + 0x40000000a00021e7, + 0x22f25232b2825207, + 0x3a00000000105c02, + 0x1800000000001de2, + 0x09ffc00000105c02, + 0x40000000e0001de7, +/* 0x1128: rcp_result_denorm */ + 0x1a8e0000fc31dc03, + 0x3a003ffffc105c02, + 0x1000cfa001318004, + 0x227202a2e2c282f7, + 0x1000cfc00131a004, + 0x0800400000105c02, + 0x5000000018001c01, +/* 0x1160: rcp_end */ + 0x9000000000001de7, +/* 0x1168: gk104_rsq_f64 */ + 0x1e0edffc0001dc81, + 0x3800200000104042, + 0x39fffffffc109c02, + 0x22828252c2820277, + 0x7000c02c5010dc03, + 0x198ec0000833dc03, + 0x6800000008009c43, + 0x5000d0d400000401, + 0xc80000001c115c00, + 0x128ec01ffc319c03, + 0x6800000018209c03, + 0x2282e2827202b287, + 0x1a8e0000fc21dc03, + 0x40000000800001e7, + 0x3a00000000105c02, + 0x1800000000001de2, + 0x6800000014105c43, + 0x9000000000001de7, +/* 0x11f8: rsq_norm */ + 0x1800000000011de2, + 0x22929292929292f7, + 0x1000cfc001321c04, + 0x5000000020009c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2292929292929297, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x20000002e2820297, + 0x5000d06800410401, + 0x2800000014005de4, + 0x2800000010001de4, 0x9000000000001de7, 0xc800000003f01cc5, 0x2c00000100005c04, @@ -495,7 +615,7 @@ uint64_t gk104_builtin_code[] = { 0x680100000c1fdc03, 0x4000000a60001c47, 0x180000004000dde2, -/* 0x0f60: spill_cfstack */ +/* 0x12e0: spill_cfstack */ 0x78000009c0000007, 0x0c0000000430dd02, 0x4003ffffa0001ca7, @@ -543,14 +663,14 @@ uint64_t gk104_builtin_code[] = { 
0x4000000100001ea7, 0x480100000c001c03, 0x0800000000105c42, -/* 0x10d8: shared_loop */ +/* 0x1458: shared_loop */ 0xc100000000309c85, 0x9400000500009c85, 0x0c00000010001d02, 0x0800000000105d42, 0x0c0000001030dd02, 0x4003ffff40001ca7, -/* 0x1108: shared_done */ +/* 0x1488: shared_done */ 0x2800406420001de4, 0x2800406430005de4, 0xe000000000001c45, @@ -564,7 +684,7 @@ uint64_t gk104_builtin_code[] = { 0x480000000c209c03, 0x4801000008001c03, 0x0800000000105c42, -/* 0x1170: search_cstack */ +/* 0x14f0: search_cstack */ 0x280040646000dde4, 0x8400000020009f05, 0x190ec0002821dc03, @@ -573,17 +693,17 @@ uint64_t gk104_builtin_code[] = { 0x0800000000105c42, 0x0c0000004030dd02, 0x00029dff0ffc5cbf, -/* 0x11b0: entry_found */ +/* 0x1530: entry_found */ 0x8400000000009f85, 0x2800406400001de4, 0x2800406410005de4, 0x9400000010009c85, 0x4000000000001df4, -/* 0x11d8: end_exit */ +/* 0x1558: end_exit */ 0x9800000003ffdcc5, 0xd000000000008007, 0xa000000000004007, -/* 0x11f0: end_cont */ +/* 0x1570: end_cont */ 0xd000000000008007, 0x3400c3fffc201c04, 0xc000000003f01ec5, @@ -593,6 +713,6 @@ uint64_t gk104_builtin_code[] = { uint64_t gk104_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, - 0x0000000000000f08, 0x0000000000000f18, + 0x0000000000001168, }; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm index b9c05a04b9a..4047a565a9f 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -83,12 +83,229 @@ gk110_div_s32: $p0 sub b32 $r1 $r1 $r2 $p0 add b32 $r0 $r0 0x1 $p3 cvt s32 $r0 neg s32 $r0 - sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c + sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28 $p2 cvt s32 $r1 neg s32 $r1 ret +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// 
value (single precision rcp in RCP and rsqrt64h in RSQ). +// gk110_rcp_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + ext u32 $r2 $r1 0xb14 + add b32 $r3 $r2 0xffffffff + joinat #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + set b32 $p0 0x1 gt u32 $r3 0x7fd + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov b32 $r3 0x0 + sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 + join (not $p0) nop + // Process all special values: NaN, inf, denorm, 0 + mov b32 $r3 0xffffffff + // A number is NaN if its abs value is greater than or unordered with inf + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + (not $p0) bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. 
This + // behavior is both seen on the CPU and the blob + join or b32 $r1 $r1 0x80000 +rcp_inf_or_denorm_or_zero: + and b32 $r4 $r1 0x7ff00000 + // Other values with nonzero in exponent field should be inf + set b32 $p0 0x1 eq s32 $r4 0x0 + sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + xor b32 $r1 $r1 0x7ff00000 + join mov b32 $r0 0x0 +rcp_denorm_or_zero: + set $p0 0x1 gtu f64 abs $r0d 0x0 + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + join or b32 $r1 $r1 0x7ff00000 +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + mul rn f64 $r0d $r0d 0x4350000000000000 + sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 + join mov b32 $r3 0x36 +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + set b32 $p0 0x1 lt s32 $r3 0x0 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ ext u32 $r2 $r1 0xb14 + and b32 $r7 $r1 0x800fffff + add b32 $r7 $r7 0x3ff00000 + mov b32 $r6 $r0 + sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + cvt rz f32 $r5 f64 $r6d + rcp f32 $r4 $r5 + mov b32 $r0 0xbf800000 + fma rn f32 $r5 $r4 $r5 $r0 + fma rn f32 $r0 neg $r4 $r5 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + cvt f64 $r0d f32 $r0 + cvt f64 $r6d f64 neg $r6d + sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 + cvt f64 $r8d f32 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + subr b32 $r2 $r2 0x3ff + add b32 $r4 $r2 $r3 + ext u32 $r3 $r1 0xb14 + // New exponent in $r3 + add b32 $r3 $r3 $r4 + add b32 $r2 $r3 0xffffffff + sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + set b32 $p0 0x1 lt u32 $r2 0x7fe + (not $p0) bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl b32 $r4 $r4 clamp 0x14 + add b32 $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + set b32 $p0 0x1 ge s32 $r3 0x7ff + (not $p0) bra #rcp_result_denorm + sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f + // Infinity + and b32 $r1 $r1 0x80000000 + mov b32 $r0 0x0 + add b32 $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + set b32 $p0 0x1 ne u32 $r3 0x0 + and b32 $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 cvt f64 $r6d f32 0x3e800000 + sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 + // 0x3f000000: 1/2 + (not $p0) cvt f64 $r6d f32 0x3f000000 + add b32 $r1 $r1 0x00100000 + mul rn f64 $r0d $r0d $r6d +rcp_end: + ret + +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// gk110_rsq_f64: + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + $p0 or b32 $r1 $r1 0x00080000 + and b32 $r2 $r1 0x7fffffff + sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + ext u32 $r3 $r1 0xb14 + set b32 $p1 0x1 le u32 $r3 0x2 + or b32 $r2 $r0 $r2 + $p1 mul rn f64 $r0d $r0d 0x4350000000000000 + rsqrt64h f32 $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + set b32 $r6 ne u32 $r3 0x7ff + and b32 $r2 $r2 $r6 + sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 + set b32 $p0 0x1 ne u32 $r2 0x0 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + and b32 $r1 $r1 0x80000000 + mov b32 $r0 0x0 + or b32 $r1 $r1 $r5 + ret +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + mov b32 $r4 0x0 + sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 + // 0x3f000000: 1/2 + cvt f64 $r8d f32 0x3f000000 + mul rn f64 $r2d $r0d $r8d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 
$r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 + // Multiply 2^27 to result for small inputs to recover + $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 + mov b32 $r1 $r5 + mov b32 $r0 $r4 ret .section #gk110_builtin_offsets diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h index 8d00e2a2245..3d1523f2fdd 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -65,11 +65,132 @@ uint64_t gk110_builtin_code[] = { 0xe088000001000406, 0x4000000000800001, 0xe6010000000ce802, - 0x08b08010a010b810, + 0x08a0a0a010a0b810, 0xe60100000088e806, 0x19000000001c003c, /* 0x0218: gk110_rcp_f64 */ -/* 0x0218: gk110_rsq_f64 */ + 0xc00000058a1c0409, + 0x407fffffff9c080d, + 0x1480000050000000, + 0xb3401c03fe9c0c1d, + 0xe4c03c007f9c000e, + 0x08a0a0bcacb410bc, + 0x8580000000603c02, + 0x747fffffff9fc00e, + 0xb4601fff801c021d, + 0x120000000420003c, + 0x21000400005c0404, +/* 0x0270: rcp_inf_or_denorm_or_zero */ + 0x203ff800001c0410, + 0xb3281c00001c101d, + 0x0880bcacb4bc10ac, + 0x120000000800003c, + 0x223ff800001c0404, + 0xe4c03c007fdc0002, +/* 0x02a0: rcp_denorm_or_zero */ + 0xb4601c00001c021d, + 0x120000000400003c, + 0x213ff800005c0404, +/* 0x02b8: rcp_denorm */ + 0xc400021a801c0001, + 0x08a010a0a0aca0bc, + 0x740000001b5fc00e, +/* 0x02d0: rcp_rejoin */ + 0xb3181c00001c0c1d, + 0x12000000c000003c, + 0xc00000058a1c0409, + 0x204007ffff9c041c, + 0x401ff800001c1c1d, + 0xe4c03c00001c001a, + 0x08b8aca8a0a010ac, + 0xe5400c00031c3816, + 0x84000000021c1412, + 0x745fc000001fc002, + 0xcc000000029c1016, + 0xcc081000029c1002, + 0xe5400000001c2c02, + 0xe5410000031c3c1a, + 0x08a4a4a4a4a4a4b8, + 0xc54001fc001c2c21, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0x08a0a0a0a0a080a4, + 0xdb802000001c1812, + 0xdb800000021c0002, + 
0x48000001ff9c0809, + 0xe0800000019c0812, + 0xc00000058a1c040d, + 0xe0800000021c0c0e, + 0x407fffffff9c0c09, + 0x08aca0a0aca0aca0, + 0xb3101c03ff1c081d, + 0x120000000c20003c, + 0xc24000000a1c1011, + 0xe0800000009c1006, + 0x12000000381c003c, +/* 0x03f0: rcp_result_inf_or_denorm */ + 0xb3681c03ff9c0c1d, + 0x120000001420003c, + 0x08bc948caca09480, + 0x20400000001c0404, + 0xe4c03c007f9c0002, + 0x403ff800001c0405, + 0x120000001c1c003c, +/* 0x0428: rcp_result_denorm */ + 0xb3501c00001c0c1d, + 0x204007ffff9c0404, + 0xc54001f400002c19, + 0x089c80a8b8b0a0bc, + 0xc54001f800202c19, + 0x40000800001c0405, + 0xe4000000031c0002, +/* 0x0460: rcp_end */ + 0x19000000001c003c, +/* 0x0468: gk110_rsq_f64 */ + 0xb4601fff801c021d, + 0x2100040000000404, + 0x203fffffff9c0408, + 0x08a0a094b0a0809c, + 0xc00000058a1c040d, + 0xb3301c00011c0c3d, + 0xe2001000011c000a, + 0xc400021a80040001, + 0x84000000039c0416, + 0xb2d01c03ff9c0c19, + 0xe2000000031c080a, + 0x08a0b8a09c80aca0, + 0xb3501c00001c081d, + 0x120000001000003c, + 0x20400000001c0404, + 0xe4c03c007f9c0002, + 0xe2001000029c0406, + 0x19000000001c003c, +/* 0x04f8: rsq_norm */ + 0xe4c03c007f9c0012, + 0x08a4a4a4a4a4a4bc, + 0xc54001f8001c2c21, + 0xe4000000041c000a, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0x08a4a4a4a4a4a4a4, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0x08000000b8a080a4, + 0xc400020d00041011, + 0xe4c03c00029c0006, + 0xe4c03c00021c0002, 0x19000000001c003c, }; @@ -77,5 +198,5 @@ uint64_t gk110_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, 0x0000000000000218, - 0x0000000000000218, + 0x0000000000000468, }; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 7ee5f8fc65b..faee0218d18 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ 
b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -100,10 +100,253 @@ gm107_div_s32: ret nop 0 -// STUB +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// value (single precision rcp in RCP and rsqrt64h in RSQ). +// gm107_rcp_f64: -gm107_rsq_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r2 $r1 0xb14 + iadd32i $r3 $r2 -1 + ssy #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + sched (st 0x0) (st 0x0) (st 0x0) + isetp gt u32 and $p0 1 $r3 0x7fd 1 + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov $r3 0x0 0xf + not $p0 sync + // Process all special values: NaN, inf, denorm, 0 + sched (st 0x0) (st 0x0) (st 0x0) + mov32i $r3 0xffffffff 0xf + // A number is NaN if its abs value is greater than or unordered with inf + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + not $p0 bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. 
This + // behavior is both seen on the CPU and the blob + sched (st 0x0) (st 0x0) (st 0x0) + lop32i or $r1 $r1 0x80000 + sync +rcp_inf_or_denorm_or_zero: + lop32i and $r4 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + // Other values with nonzero in exponent field should be inf + isetp eq and $p0 1 $r4 0x0 1 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + lop32i xor $r1 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + mov $r0 0x0 0xf + sync +rcp_denorm_or_zero: + dsetp gtu and $p0 1 abs $r0 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + lop32i or $r1 $r1 0x7ff00000 + sync +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + sched (st 0x0) (st 0x0) (st 0x0) + dmul $r0 $r0 0x4350000000000000 + mov $r3 0x36 0xf + sync +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt and $p0 1 $r3 0x0 1 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ bfe u32 $r2 $r1 0xb14 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r7 $r1 0x800fffff + iadd32i $r7 $r7 0x3ff00000 + mov $r6 $r0 0xf + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + sched (st 0x0) (st 0x0) (st 0x0) + f2f ftz f64 f32 $r5 $r6 + mufu rcp $r4 $r5 + mov32i $r0 0xbf800000 0xf + sched (st 0x0) (st 0x0) (st 0x0) + ffma $r5 $r4 $r5 $r0 + ffma $r0 $r5 neg $r4 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + f2f f32 f64 $r0 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + f2f f64 f64 $r6 neg $r6 + f2f f32 f64 $r8 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + dfma $r4 $r6 $r0 $r8 sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + iadd $r2 neg $r2 0x3ff + iadd $r4 $r2 $r3 + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r3 $r1 0xb14 + // New exponent in $r3 + iadd $r3 $r3 $r4 + iadd32i $r2 $r3 -1 + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt u32 and $p0 1 $r2 0x7fe 1 + not $p0 bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl $r4 $r4 0x14 + sched (st 0x0) (st 0x0) (st 0x0) + iadd $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + isetp ge and $p0 1 $r3 0x7ff 1 + sched (st 0x0) (st 0x0) (st 0x0) + not $p0 bra #rcp_result_denorm + // Infinity + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + isetp ne u32 and $p0 1 $r3 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 f2f f32 f64 $r6 0x3e800000 + // 0x3f000000: 1/2 + not $p0 f2f f32 f64 $r6 0x3f000000 + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x00100000 + dmul $r0 $r0 $r6 +rcp_end: + ret + +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// +gm107_rsq_f64: + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd) + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + $p0 lop32i or $r1 $r1 0x00080000 + lop32i and $r2 $r1 0x7fffffff + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + sched (st 0xd) (st 0xd) (st 0xd) + bfe u32 $r3 $r1 0xb14 + isetp le u32 and $p1 1 $r3 0x2 1 + lop or 1 $r2 $r0 $r2 + sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd) + $p1 dmul $r0 $r0 0x4350000000000000 + mufu rsq64h $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + iset ne u32 and $r6 $r3 0x7ff 1 + sched (st 0xd) (st 0xd) (st 0xd) + lop and 1 $r2 $r2 $r6 + isetp ne u32 and $p0 1 $r2 0x0 1 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + sched (st 0xd) (st 0xd) (st 0xd wt 0x1) + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + lop or 1 $r1 $r1 $r5 + sched (st 0xd) (st 0xf) (st 0xf) + ret + nop 0 + nop 0 +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3) + mov $r4 0x0 0xf + // 0x3f000000: 1/2 + f2f f32 f64 $r8 0x3f000000 + dmul $r2 $r0 $r8 + sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 
0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + // Multiply 2^27 to result for small inputs to recover + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd) + $p1 dmul $r4 $r4 0x41a0000000000000 + mov $r1 $r5 0xf + mov $r0 $r4 0xf + sched (st 0xd) (st 0xf) (st 0xf) ret nop 0 nop 0 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 65c93f7ae89..8eb27bbac99 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -82,8 +82,156 @@ uint64_t gm107_builtin_code[] = { 0xe32000000007000f, 0x50b0000000070f00, /* 0x0280: gm107_rcp_f64 */ -/* 0x0280: gm107_rsq_f64 */ 0x001f8000fc0007e0, + 0x38000000b1470102, + 0x1c0ffffffff70203, + 0xe29000000e000000, + 0x001f8000fc0007e0, + 0x366803807fd70307, + 0x5c9807800ff70003, + 0xf0f800000008000f, + 0x001f8000fc0007e0, + 0x010ffffffff7f003, + 0x368c03fff0070087, + 0xe24000000188000f, + 0x001f8000fc0007e0, + 0x0420008000070101, + 0xf0f800000007000f, +/* 0x02f8: rcp_inf_or_denorm_or_zero */ + 0x0407ff0000070104, + 0x001f8000fc0007e0, + 0x5b6503800ff70407, + 0xe24000000200000f, + 0x0447ff0000070101, + 0x001f8000fc0007e0, + 0x5c9807800ff70000, + 0xf0f800000007000f, +/* 0x0338: rcp_denorm_or_zero */ + 0x5b8c03800ff70087, + 0x001f8000fc0007e0, + 0xe24000000100000f, + 0x0427ff0000070101, + 0xf0f800000007000f, +/* 0x0360: rcp_denorm */ + 0x001f8000fc0007e0, + 0x3880004350070000, + 0x3898078003670003, + 0xf0f800000007000f, +/* 0x0380: rcp_rejoin */ + 0x001f8000fc0007e0, + 0x5b6303800ff70307, + 0xe24000001c00000f, + 0x38000000b1470102, + 
0x001f8000fc0007e0, + 0x040800fffff70107, + 0x1c03ff0000070707, + 0x5c98078000070006, + 0x001f8000fc0007e0, + 0x5ca8100000670e05, + 0x5080000000470504, + 0x010bf8000007f000, + 0x001f8000fc0007e0, + 0x5980000000570405, + 0x5981020000470500, + 0x5ca8000000070b00, + 0x001f8000fc0007e0, + 0x5ca8200000670f06, + 0x38a8003f80070b08, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x5b70040000070604, + 0x5b70000000470000, + 0x001f8000fc0007e0, + 0x5b70040000070604, + 0x5b70000000470000, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x381200003ff70202, + 0x5c10000000370204, + 0x001f8000fc0007e0, + 0x38000000b1470103, + 0x5c10000000470303, + 0x1c0ffffffff70302, + 0x001f8000fc0007e0, + 0x366203807fe70207, + 0xe24000000208000f, + 0x3848000001470404, + 0x001f8000fc0007e0, + 0x5c10000000170401, + 0xe24000000807000f, +/* 0x04d8: rcp_result_inf_or_denorm */ + 0x366d03807ff70307, + 0x001f8000fc0007e0, + 0xe24000000288000f, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x001f8000fc0007e0, + 0x1c07ff0000070101, + 0xe24000000407000f, +/* 0x0518: rcp_result_denorm */ + 0x5b6a03800ff70307, + 0x001f8000fc0007e0, + 0x040800fffff70101, + 0x38a8003e80000b06, + 0x38a8003f00080b06, + 0x001f8000fc0007e0, + 0x1c00010000070101, + 0x5c80000000670000, +/* 0x0558: rcp_end */ + 0xe32000000007000f, +/* 0x0560: gm107_rsq_f64 */ + 0x001fb401fda1ff0d, + 0x368c03fff0070087, + 0x0420008000000101, + 0x0407fffffff70102, + 0x001fb400fda007ed, + 0x38000000b1470103, + 0x366603800027030f, + 0x5c47020000270002, + 0x001fb401e1a0070d, + 0x3880004350010000, + 0x5080000000770105, + 0x365a03807ff70306, + 0x001fb400fda007ed, + 0x5c47000000670202, + 0x5b6a03800ff70207, + 0xe24000000400000f, + 0x003fb400fda007ed, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x5c47020000570101, + 0x001fbc00fde007ed, + 0xe32000000007000f, + 0x50b0000000070f00, + 0x50b0000000070f00, +/* 0x0620: rsq_norm */ + 0x0060b400e5a007ed, + 0x5c9807800ff70004, + 0x38a8003f00070b08, + 0x5c80000000870002, + 
0x003c3401e1a01f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x001fb401fda00f0d, + 0x38800041a0010404, + 0x5c98078000570001, + 0x5c98078000470000, + 0x001fbc00fde007ed, 0xe32000000007000f, 0x50b0000000070f00, 0x50b0000000070f00, @@ -93,5 +241,5 @@ uint64_t gm107_builtin_offsets[] = { 0x0000000000000000, 0x0000000000000120, 0x0000000000000280, - 0x0000000000000280, + 0x0000000000000560, }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index 49425b98b91..993d01c1e44 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch) binSize = 0; maxGPR = -1; + fp64 = false; main = new Function(this, "MAIN", ~0); calls.insert(&main->call); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 8085bb2f542..8d32a25ec23 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -1311,6 +1311,7 @@ class Program uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL int maxGPR; + bool fp64; MemoryPool mem_Instruction; MemoryPool mem_CmpInstruction; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index afd7916a321..335e708c5cb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1087,6 +1087,8 @@ class Source }; std::vector memoryFiles; + std::vector bufferAtomics; + private: int inferSysValDirection(unsigned sn) const; bool 
scanDeclaration(const struct tgsi_full_declaration *); @@ -1137,6 +1139,7 @@ bool Source::scanSource() //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1); memoryFiles.resize(scan.file_max[TGSI_FILE_MEMORY] + 1); + bufferAtomics.resize(scan.file_max[TGSI_FILE_BUFFER] + 1); info->immd.bufSize = 0; @@ -1483,11 +1486,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair( first, last - first + 1))); break; + case TGSI_FILE_BUFFER: + for (i = first; i <= last; ++i) + bufferAtomics[i] = decl->Declaration.Atomic; + break; case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: case TGSI_FILE_IMMEDIATE: case TGSI_FILE_SAMPLER: - case TGSI_FILE_BUFFER: case TGSI_FILE_IMAGE: break; default: @@ -2720,7 +2726,11 @@ Converter::handleLOAD(Value *dst0[4]) } Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off); - ld->cache = tgsi.getCacheMode(); + if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER && + code->bufferAtomics[r]) + ld->cache = nv50_ir::CACHE_CG; + else + ld->cache = tgsi.getCacheMode(); if (ind) ld->setIndirect(0, 1, ind); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 295497be2f9..346a98228bd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -83,6 +83,38 @@ NVC0LegalizeSSA::handleDIV(Instruction *i) delete_Instruction(prog, i); } +void +NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[]) +{ + FlowInstruction *call; + Value *def[2]; + int builtin; + + def[0] = bld.mkMovToReg(0, src[0])->getDef(0); + def[1] = bld.mkMovToReg(1, src[1])->getDef(0); + + if (i->op == OP_RCP) + builtin = NVC0_BUILTIN_RCP_F64; + else + builtin = NVC0_BUILTIN_RSQ_F64; + + call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); + def[0] = bld.getSSA(); + 
def[1] = bld.getSSA(); + bld.mkMovFromReg(def[0], 0); + bld.mkMovFromReg(def[1], 1); + bld.mkClobber(FILE_GPR, 0x3fc, 2); + bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0); + bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]); + + call->fixed = 1; + call->absolute = call->builtin = 1; + call->target.builtin = builtin; + delete_Instruction(prog, i); + + prog->fp64 = true; +} + void NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) { @@ -96,6 +128,12 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) Value *src[2], *dst[2], *def = i->getDef(0); bld.mkSplit(src, 4, i->getSrc(0)); + int chip = prog->getTarget()->getChipset(); + if (chip >= NVISA_GK104_CHIPSET) { + handleRCPRSQLib(i, src); + return; + } + // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. dst[0] = bld.loadImm(NULL, 0); dst[1] = bld.getSSA(); @@ -1063,22 +1101,6 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) } } - if (chipset >= NVISA_GK104_CHIPSET) { - // - // If TEX requires more than 4 sources, the 2nd register tuple must be - // aligned to 4, even if it consists of just a single 4-byte register. - // - // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case. 
- // - int s = i->srcCount(0xff, true); - if (s > 4 && s < 7) { - if (i->srcExists(s)) // move potential predicate out of the way - i->moveSources(s, 7 - s); - while (s < 7) - i->setSrc(s++, bld.loadImm(NULL, 0)); - } - } - return true; } @@ -1887,7 +1909,8 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB; const int slot = su->tex.r; const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); + const bool array = su->tex.target.isArray() || su->tex.target.isCube(); + const int arg = dim + array; int c; Value *zero = bld.mkImm(0); Value *p1 = NULL; @@ -1896,6 +1919,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) Value *bf, *eau, *off; Value *addr, *pred; Value *ind = su->getIndirectR(); + Value *y, *z; off = bld.getScratch(4); bf = bld.getScratch(4); @@ -1926,34 +1950,42 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) for (; c < 3; ++c) src[c] = zero; + if (dim == 2 && !array) { + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), + v, bld.loadImm(NULL, 16)); + + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless); + bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero) + ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2); + } + // set predicate output if (su->tex.target == TEX_TARGET_BUFFER) { src[0]->getInsn()->setFlagsDef(1, pred); } else - if (su->tex.target.isArray() || su->tex.target.isCube()) { + if (array) { p1 = bld.getSSA(1, FILE_PREDICATE); src[dim]->getInsn()->setFlagsDef(1, p1); } // calculate pixel offset if (dim == 1) { + y = z = zero; if (su->tex.target != TEX_TARGET_BUFFER) bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); - } else - if (dim == 3) { + } else { + y = src[1]; + z = src[2]; + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); bld.mkOp3(OP_MADSP, 
TYPE_U32, off, src[2], v, src[1]) - ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) - ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l - } else { - assert(dim == 2); - v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); - bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0]) - ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ? - NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + ->subOp = array ? + NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l } // calculate effective address part 1 @@ -1966,19 +1998,15 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); } } else { - Value *y = src[1]; - Value *z = src[2]; uint16_t subOp = 0; switch (dim) { case 1: - y = zero; - z = zero; break; case 2: - z = off; - if (!su->tex.target.isArray() && !su->tex.target.isCube()) { - z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + if (array) { + z = off; + } else { subOp = NV50_IR_SUBOP_SUBFM_3D; } break; @@ -2001,7 +2029,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v); } // add array layer offset - if (su->tex.target.isArray() || su->tex.target.isCube()) { + if (array) { v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless); if (dim == 1) bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index e0f50ab0904..99809726602 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -62,6 +62,7 @@ class NVC0LegalizeSSA : public Pass // we want to 
insert calls to the builtin library only after optimization void handleDIV(Instruction *); // integer division, modulus + void handleRCPRSQLib(Instruction *, Value *[]); void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt void handleFTZ(Instruction *); void handleSET(CmpInstruction *); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index f4379c137c5..f25bce00884 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2341,9 +2341,19 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex) if (!tex->tex.target.isArray() && tex->tex.useOffsets) s++; } - n = tex->srcCount(0xff) - s; + n = tex->srcCount(0xff, true) - s; + // TODO: Is this necessary? Perhaps just has to be aligned to the + // level that the first arg is, not necessarily to 4. This + // requirement has not been rigorously verified, as it has been on + // Kepler. + if (n > 0 && n < 3) { + if (tex->srcExists(n + s)) // move potential predicate out of the way + tex->moveSources(n + s, 3 - n); + while (n < 3) + tex->setSrc(s + n++, new_LValue(func, FILE_GPR)); + } } else { - s = tex->srcCount(0xff); + s = tex->srcCount(0xff, true); n = 0; } @@ -2366,14 +2376,18 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex) } else if (isTextureOp(tex->op)) { int n = tex->srcCount(0xff, true); - if (n > 4) { - condenseSrcs(tex, 0, 3); - if (n > 5) // NOTE: first call modified positions already - condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1)); - } else - if (n > 1) { - condenseSrcs(tex, 0, n - 1); + int s = n > 4 ? 
4 : n; + if (n > 4 && n < 7) { + if (tex->srcExists(n)) // move potential predicate out of the way + tex->moveSources(n, 7 - n); + + while (n < 7) + tex->setSrc(n++, new_LValue(func, FILE_GPR)); } + if (s > 1) + condenseSrcs(tex, 0, s - 1); + if (n > 4) + condenseSrcs(tex, 1, n - s); } } @@ -2510,6 +2524,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s) assert(cst->getSrc(s)->defs.size() == 1); // still SSA Instruction *defi = cst->getSrc(s)->defs.front()->getInsn(); + bool imm = defi->op == OP_MOV && defi->src(0).getFile() == FILE_IMMEDIATE; bool load = defi->op == OP_LOAD && diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 9193a01f189..5c6d0570ae2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -399,6 +399,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) } } } + info->io.fp64 |= fp64; info->bin.relocData = emit->getRelocInfo(); info->bin.fixupData = emit->getFixupInfo(); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2b69a8f6968..53551ebc037 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -79,6 +79,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 2048; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 8 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + return 8; + /* supported capabilities */ case PIPE_CAP_ANISOTROPIC_FILTER: case PIPE_CAP_POINT_SPRITE: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index e30380cd84d..13088ebb5fa 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -98,12 +98,10 @@ nv50_render_condition(struct pipe_context *pipe, case 
PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + if (hq->state == NV50_HW_QUERY_STATE_READY) + wait = true; if (likely(!condition)) { - if (unlikely(hq->nesting)) - cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : - NV50_3D_COND_MODE_ALWAYS; - else - cond = NV50_3D_COND_MODE_RES_NON_ZERO; + cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS; } else { cond = wait ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_ALWAYS; } @@ -129,7 +127,7 @@ nv50_render_condition(struct pipe_context *pipe, PUSH_SPACE(push, 9); - if (wait) { + if (wait && hq->state != NV50_HW_QUERY_STATE_READY) { BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c index ac3e409b2d5..4e74c462235 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c @@ -29,11 +29,6 @@ #include "nv50/nv50_query_hw_sm.h" #include "nv_object.xml.h" -#define NV50_HW_QUERY_STATE_READY 0 -#define NV50_HW_QUERY_STATE_ACTIVE 1 -#define NV50_HW_QUERY_STATE_ENDED 2 -#define NV50_HW_QUERY_STATE_FLUSHED 3 - /* XXX: Nested queries, and simultaneous queries on multiple gallium contexts * (since we use only a single GPU channel per screen) will not work properly. 
* @@ -158,8 +153,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - hq->nesting = nv50->screen->num_occlusion_queries_active++; - if (hq->nesting) { + if (nv50->screen->num_occlusion_queries_active++) { nv50_hw_query_get(push, q, 0x10, 0x0100f002); } else { PUSH_SPACE(push, 4); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h index 82ec6bd2d96..a89a66cec4f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h @@ -6,6 +6,11 @@ #include "nv50_query.h" +#define NV50_HW_QUERY_STATE_READY 0 +#define NV50_HW_QUERY_STATE_ACTIVE 1 +#define NV50_HW_QUERY_STATE_ENDED 2 +#define NV50_HW_QUERY_STATE_FLUSHED 3 + #define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nv50_hw_query; @@ -29,7 +34,6 @@ struct nv50_hw_query { uint8_t state; bool is64bit; uint8_t rotate; - int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 0c53b22eb3c..8e65eaf50b1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -156,6 +156,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return NV50_MAX_WINDOW_RECTANGLES; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 16 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + return 15; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -215,6 +217,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_CLOCK: case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: + case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: 
return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -312,6 +315,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_ATOMFADD: case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index eb057bf2489..c1351062676 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -434,6 +434,7 @@ nvc0_video_buffer_create(struct pipe_context *pipe, /* nvc0_push.c */ void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *); +void nvc0_push_vbo_indirect(struct nvc0_context *, const struct pipe_draw_info *); /* nve4_compute.c */ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 1a3e4e794c0..40af9936859 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -121,12 +121,10 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + if (hq->state == NVC0_HW_QUERY_STATE_READY) + wait = true; if (likely(!condition)) { - if (unlikely(hq->nesting)) - cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : - NVC0_3D_COND_MODE_ALWAYS; - else - cond = NVC0_3D_COND_MODE_RES_NON_ZERO; + cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS; } else { cond = wait ? 
NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS; } @@ -151,7 +149,7 @@ nvc0_render_condition(struct pipe_context *pipe, return; } - if (wait) + if (wait && hq->state != NVC0_HW_QUERY_STATE_READY) nvc0_hw_query_fifo_wait(nvc0, q); PUSH_SPACE(push, 10); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index a420ed4ac0d..f6d5d0f5602 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -28,11 +28,6 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -#define NVC0_HW_QUERY_STATE_READY 0 -#define NVC0_HW_QUERY_STATE_ACTIVE 1 -#define NVC0_HW_QUERY_STATE_ENDED 2 -#define NVC0_HW_QUERY_STATE_FLUSHED 3 - #define NVC0_HW_QUERY_ALLOC_SPACE 256 bool @@ -158,14 +153,18 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - hq->nesting = nvc0->screen->num_occlusion_queries_active++; - if (hq->nesting) { + if (nvc0->screen->num_occlusion_queries_active++) { nvc0_hw_query_get(push, q, 0x10, 0x0100f002); } else { PUSH_SPACE(push, 3); BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + /* Given that the counter is reset, the contents at 0x10 are + * equivalent to doing the query -- we would get hq->sequence as the + * payload and 0 as the reported value. This is already set up above + * as in the hq->rotate case. 
+ */ } break; case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -199,6 +198,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ + ((uint64_t *)hq->data)[(12 + 10) * 2] = 0; break; default: break; @@ -271,6 +271,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ + ((uint64_t *)hq->data)[10 * 2] = 0; break; case PIPE_QUERY_TIMESTAMP_DISJOINT: /* This query is not issued on GPU because disjoint is forced to false */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h index 8225755d85e..5c8ad5eb2d0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -6,6 +6,11 @@ #include "nvc0_query.h" +#define NVC0_HW_QUERY_STATE_READY 0 +#define NVC0_HW_QUERY_STATE_ACTIVE 1 +#define NVC0_HW_QUERY_STATE_ENDED 2 +#define NVC0_HW_QUERY_STATE_FLUSHED 3 + #define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nvc0_hw_query; @@ -29,7 +34,6 @@ struct nvc0_hw_query { uint8_t state; boolean is64bit; uint8_t rotate; - int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 68b5869276a..553fe324bc7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -182,6 +182,13 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return class_3d >= GM200_3D_CLASS ? 
8 : 0; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 64 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + /* NOTE: These only count our slots for GENERIC varyings. + * The address space may be larger, but the actual hard limit seems to be + * less than what the address space layout permits, so don't add TEXCOORD, + * COLOR, etc. here. + */ + return 0x1f0 / 16; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -266,6 +273,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: case PIPE_CAP_QUERY_SO_OVERFLOW: + case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: return 1; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; @@ -336,6 +344,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_SAMPLE_COUNT: case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: return 0; case PIPE_CAP_VENDOR_ID: @@ -392,18 +401,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: return 16; case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_VERTEX) - return 32; - /* NOTE: These only count our slots for GENERIC varyings. - * The address space may be larger, but the actual hard limit seems to be - * less than what the address space layout permits, so don't add TEXCOORD, - * COLOR, etc. here. - */ - if (shader == PIPE_SHADER_FRAGMENT) - return 0x1f0 / 16; - /* Actually this counts CLIPVERTEX, which occupies the last generic slot, - * and excludes 0x60 per-patch inputs. 
- */ return 0x200 / 16; case PIPE_SHADER_CAP_MAX_OUTPUTS: return 32; @@ -1286,8 +1283,8 @@ nvc0_screen_create(struct nouveau_device *dev) for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) { BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3); PUSH_DATA (push, 1); - PUSH_DATA (push, 8192 << 16); - PUSH_DATA (push, 8192 << 16); + PUSH_DATA (push, 16384 << 16); + PUSH_DATA (push, 16384 << 16); } #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index 04f0a0d55da..8820b5aac66 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -1051,21 +1051,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, } else { struct nv50_miptree *mt = nv50_miptree(&res->base); struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level]; - const unsigned z = view->u.tex.first_layer; - - if (z) { - if (mt->layout_3d) { - address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z); - /* doesn't work if z passes z-tile boundary */ - if (depth > 1) { - pipe_debug_message(&nvc0->base.debug, CONFORMANCE, - "3D images are not really supported!"); - debug_printf("3D images are not really supported!\n"); - } - } else { - address += mt->layer_stride * z; - } + unsigned z = view->u.tex.first_layer; + + if (!mt->layout_3d) { + address += mt->layer_stride * z; + z = 0; } + address += lvl->offset; info[0] = address >> 8; @@ -1080,7 +1072,8 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, info[6] = depth - 1; info[6] |= (lvl->tile_mode & 0xf00) << 21; info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22; - info[7] = 0; + info[7] = mt->layout_3d ? 
1 : 0; + info[7] |= z << 16; info[14] = mt->ms_x; info[15] = mt->ms_y; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 3fbe7614e52..7d6be9382d1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -1040,7 +1040,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } if (nvc0->state.vbo_mode) { - nvc0_push_vbo(nvc0, info); + if (info->indirect) + nvc0_push_vbo_indirect(nvc0, info); + else + nvc0_push_vbo(nvc0, info); goto cleanup; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c index 256e20df2e4..4333fb26d23 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c @@ -466,6 +466,83 @@ nvc0_prim_gl(unsigned prim) } } +typedef struct { + uint32_t count; + uint32_t primCount; + uint32_t first; + uint32_t baseInstance; +} DrawArraysIndirectCommand; + +typedef struct { + uint32_t count; + uint32_t primCount; + uint32_t firstIndex; + int32_t baseVertex; + uint32_t baseInstance; +} DrawElementsIndirectCommand; + +void +nvc0_push_vbo_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) +{ + /* The strategy here is to just read the commands from the indirect buffer + * and do the draws. This is suboptimal, but will only happen in the case + * that conversion is required for FIXED or DOUBLE inputs. 
+ */ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nv04_resource *buf = nv04_resource(info->indirect->buffer); + struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count); + unsigned i; + + unsigned draw_count = info->indirect->draw_count; + if (buf_count) { + uint32_t *count = nouveau_resource_map_offset( + &nvc0->base, buf_count, info->indirect->indirect_draw_count_offset, + NOUVEAU_BO_RD); + draw_count = *count; + } + + uint8_t *buf_data = nouveau_resource_map_offset( + &nvc0->base, buf, info->indirect->offset, NOUVEAU_BO_RD); + struct pipe_draw_info single = *info; + single.indirect = NULL; + for (i = 0; i < draw_count; i++, buf_data += info->indirect->stride) { + if (info->index_size) { + DrawElementsIndirectCommand *cmd = (void *)buf_data; + single.start = info->start + cmd->firstIndex; + single.count = cmd->count; + single.start_instance = cmd->baseInstance; + single.instance_count = cmd->primCount; + single.index_bias = cmd->baseVertex; + } else { + DrawArraysIndirectCommand *cmd = (void *)buf_data; + single.start = cmd->first; + single.count = cmd->count; + single.start_instance = cmd->baseInstance; + single.instance_count = cmd->primCount; + } + + if (nvc0->vertprog->vp.need_draw_parameters) { + PUSH_SPACE(push, 9); + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); + PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO); + PUSH_DATA (push, single.index_bias); + PUSH_DATA (push, single.start_instance); + PUSH_DATA (push, single.drawid + i); + } + + nvc0_push_vbo(nvc0, &single); + } + + nouveau_resource_unmap(buf); + if (buf_count) + nouveau_resource_unmap(buf_count); +} + void nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) { diff --git 
a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 19d3a1bae30..be0b475e5ef 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -304,6 +304,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; + case PIPE_CAP_MAX_VARYINGS: + return 10; + case PIPE_CAP_VENDOR_ID: return 0x1002; case PIPE_CAP_DEVICE_ID: diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index ade1a94ab32..41a878ab9d2 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -536,6 +536,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXEL_OFFSET: return 7; + case PIPE_CAP_MAX_VARYINGS: + return 32; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600; case PIPE_CAP_ENDIANNESS: diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index f86764f5220..96ffbf82927 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -1837,18 +1837,17 @@ static void r600_emit_sampler_states(struct r600_context *rctx, /* TEX_ARRAY_OVERRIDE must be set for array textures to disable * filtering between layers. - * Don't update TEX_ARRAY_OVERRIDE if we don't have the sampler view. 
*/ - if (rview) { - enum pipe_texture_target target = rview->base.texture->target; - if (target == PIPE_TEXTURE_1D_ARRAY || - target == PIPE_TEXTURE_2D_ARRAY) { - rstate->tex_sampler_words[0] |= S_03C000_TEX_ARRAY_OVERRIDE(1); - texinfo->is_array_sampler[i] = true; - } else { - rstate->tex_sampler_words[0] &= C_03C000_TEX_ARRAY_OVERRIDE; - texinfo->is_array_sampler[i] = false; - } + enum pipe_texture_target target = PIPE_BUFFER; + if (rview) + target = rview->base.texture->target; + if (target == PIPE_TEXTURE_1D_ARRAY || + target == PIPE_TEXTURE_2D_ARRAY) { + rstate->tex_sampler_words[0] |= S_03C000_TEX_ARRAY_OVERRIDE(1); + texinfo->is_array_sampler[i] = true; + } else { + rstate->tex_sampler_words[0] &= C_03C000_TEX_ARRAY_OVERRIDE; + texinfo->is_array_sampler[i] = false; } radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0)); diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c b/src/gallium/drivers/radeon/radeon_vcn_dec.c index a4e6d9dc6b5..688cef90103 100644 --- a/src/gallium/drivers/radeon/radeon_vcn_dec.c +++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c @@ -64,6 +64,7 @@ static rvcn_dec_message_avc_t get_h264_msg(struct radeon_decoder *dec, memset(&result, 0, sizeof(result)); switch (pic->base.profile) { case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE: result.profile = RDECODE_H264_PROFILE_BASELINE; break; @@ -490,7 +491,7 @@ static rvcn_dec_message_vp9_t get_vp9_msg(struct radeon_decoder *dec, assert(dec->base.max_references + 1 <= 16); - for (i = 0 ; i < dec->base.max_references + 1 ; ++i) { + for (i = 0 ; i < 16 ; ++i) { if (dec->render_pic_list[i] && dec->render_pic_list[i] == target) { result.curr_pic_idx = (uintptr_t)vl_video_buffer_get_associated_data(target, &dec->base); diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 713629c6e87..3cdd0851a5c 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ 
b/src/gallium/drivers/radeonsi/Makefile.sources @@ -14,6 +14,7 @@ C_SOURCES := \ si_compute_blit.c \ si_cp_dma.c \ si_debug.c \ + si_debug_options.h \ si_descriptors.c \ si_dma.c \ si_dma_cs.c \ diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h index cbf3bb01fb3..000a300746e 100644 --- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h +++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h @@ -11,5 +11,14 @@ DRI_CONF_SECTION_PERFORMANCE DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG - DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false") + +//= BEGIN VERBATIM +#define OPT_BOOL(name, dflt, description) \ + DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \ + DRI_CONF_DESC(en, description) \ + DRI_CONF_OPT_END + +#include "radeonsi/si_debug_options.h" +//= END VERBATIM + DRI_CONF_SECTION_END diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index 03c11cb7013..3845e56a4b3 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx, struct si_resource *buf = si_resource(transfer->resource); if (stransfer->staging) { + unsigned src_offset = stransfer->offset + + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + + (box->x - transfer->box.x); + /* Copy the staging buffer into the original one. 
*/ si_copy_buffer((struct si_context*)ctx, transfer->resource, - &stransfer->staging->b.b, box->x, - stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT, + &stransfer->staging->b.b, box->x, src_offset, box->width); } diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 9026f61dc0a..ef25c79fa9c 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -272,7 +272,7 @@ void vi_dcc_clear_level(struct si_context *sctx, } si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); } /* Set the same micro tile mode as the destination of the last MSAA resolve. @@ -505,7 +505,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, uint32_t clear_value = 0xCCCCCCCC; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); fmask_decompress_needed = true; } @@ -533,7 +533,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, uint32_t clear_value = 0; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META); + &clear_value, 4, SI_COHERENCY_CB_META, false); eliminate_needed = true; } @@ -647,7 +647,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, * This hack decreases back-to-back ClearDepth performance. 
*/ if ((sctx->db_depth_clear || sctx->db_stencil_clear) && - sctx->screen->clear_db_cache_before_clear) + sctx->screen->options.clear_db_cache_before_clear) sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; } diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 38c48c30be9..304296c4a52 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -177,7 +177,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher) + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma) { if (!size) return; @@ -241,7 +242,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, * about buffer placements. */ if (clear_value_size > 4 || - (clear_value_size == 4 && + (!force_cpdma && + clear_value_size == 4 && offset % 4 == 0 && (size > 32*1024 || sctx->chip_class <= VI))) { si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, @@ -282,7 +284,7 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, coher = SI_COHERENCY_SHADER; si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value, - clear_value_size, coher); + clear_value_size, coher, false); } void si_copy_buffer(struct si_context *sctx, diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h new file mode 100644 index 00000000000..165dba8baf5 --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -0,0 +1,4 @@ +OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth clear") +OPT_BOOL(enable_nir, false, "Enable NIR") + +#undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index bb2d8c09eb1..ff25a976e77 100644 --- 
a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -208,7 +208,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) RADEON_SPARSE_PAGE_SIZE : 0; case PIPE_CAP_PACKED_UNIFORMS: - if (sscreen->debug_flags & DBG(NIR)) + if (sscreen->options.enable_nir) return 1; return 0; @@ -254,6 +254,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: return 30; + case PIPE_CAP_MAX_VARYINGS: + return 32; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return sscreen->info.chip_class <= VI ? PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; @@ -420,11 +423,11 @@ static int si_get_shader_param(struct pipe_screen* pscreen, case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return SI_NUM_IMAGES; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - if (sscreen->debug_flags & DBG(NIR)) + if (sscreen->options.enable_nir) return 0; return 32; case PIPE_SHADER_CAP_PREFERRED_IR: - if (sscreen->debug_flags & DBG(NIR)) + if (sscreen->options.enable_nir) return PIPE_SHADER_IR_NIR; return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 2da14f8868f..d55394f2cba 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -1333,7 +1333,7 @@ void si_init_perfcounters(struct si_screen *screen) for (i = 0; i < num_blocks; ++i) { struct si_pc_block *block = &pc->blocks[i]; block->b = &blocks[i]; - block->num_instances = block->b->instances; + block->num_instances = MAX2(1, block->b->instances); if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB")) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 41d395d7d3f..507ca65605f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -61,7 +61,6 @@ static const 
struct debug_named_value debug_options[] = { /* Shader compiler options (with no effect on the shader cache): */ { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" }, - { "nir", DBG(NIR), "Enable experimental NIR shaders" }, { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" }, { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." }, @@ -609,11 +608,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, si_begin_new_gfx_cs(sctx); if (sctx->chip_class == CIK) { - /* Clear the NULL constant buffer, because loads should return zeros. */ + /* Clear the NULL constant buffer, because loads should return zeros. + * Note that this forces CP DMA to be used, because clover deadlocks + * for some reason when the compute codepath is used. + */ uint32_t clear_value = 0; si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, - &clear_value, 4, SI_COHERENCY_SHADER); + &clear_value, 4, SI_COHERENCY_SHADER, true); } return &sctx->b; fail: @@ -804,8 +806,7 @@ static void si_disk_cache_create(struct si_screen *sscreen) #define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) | \ DBG(SI_SCHED) | \ DBG(GISEL) | \ - DBG(UNSAFE_MATH) | \ - DBG(NIR)) + DBG(UNSAFE_MATH)) uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; @@ -813,7 +814,11 @@ static void si_disk_cache_create(struct si_screen *sscreen) * how 32-bit addresses are expanded to 64 bits. 
*/ STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); - shader_debug_flags |= (uint64_t)sscreen->info.address32_hi << 32; + assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi); + shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32; + + if (sscreen->options.enable_nir) + shader_debug_flags |= 1ull << 48; sscreen->disk_shader_cache = disk_cache_create(sscreen->info.name, @@ -866,7 +871,6 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, if (driQueryOptionb(config->options, "radeonsi_enable_sisched")) sscreen->debug_flags |= DBG(SI_SCHED); - if (sscreen->debug_flags & DBG(INFO)) ac_print_gpu_info(&sscreen->info); @@ -1013,8 +1017,16 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, driQueryOptionb(config->options, "radeonsi_assume_no_z_fights"); sscreen->commutative_blend_add = driQueryOptionb(config->options, "radeonsi_commutative_blend_add"); - sscreen->clear_db_cache_before_clear = - driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear"); + + { +#define OPT_BOOL(name, dflt, description) \ + sscreen->options.name = \ + driQueryOptionb(config->options, "radeonsi_"#name); +#include "si_debug_options.h" + } + + sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN; sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 && sscreen->info.family <= CHIP_POLARIS12) || sscreen->info.family == CHIP_VEGA10 || diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index eb3ba951dae..ea009622970 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -133,7 +133,6 @@ enum { /* Shader compiler options (with no effect on the shader cache): */ DBG_CHECK_IR, - DBG_NIR, DBG_MONOLITHIC_SHADERS, DBG_NO_OPT_VARIANT, @@ -445,7 +444,7 @@ struct si_screen { bool has_out_of_order_rast; bool assume_no_z_fights; bool 
commutative_blend_add; - bool clear_db_cache_before_clear; + bool has_gfx9_scissor_bug; bool has_msaa_sample_loc_bug; bool has_ls_vgpr_init_bug; bool has_dcc_constant_encode; @@ -453,6 +452,11 @@ struct si_screen { bool dfsm_allowed; bool llvm_has_working_vgpr_indexing; + struct { +#define OPT_BOOL(name, dflt, description) bool name:1; +#include "si_debug_options.h" + } options; + /* Whether shaders are monolithic (1-part) or separate (3-part). */ bool use_monolithic_shaders; bool record_llvm_ir; @@ -1054,7 +1058,7 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned context_roll_counter; + unsigned context_roll; /* Queries. */ /* Maintain the list of active queries for pausing between IBs. */ @@ -1168,7 +1172,8 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy); void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher); + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 266b9d3ce84..280eee3a280 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -549,11 +549,15 @@ void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buff } buffer->results_end = 0; + if (!buffer->buf) + return; + /* Discard even the oldest buffer if it can't be mapped without a stall. 
*/ - if (buffer->buf && - (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE))) { + if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) { si_resource_reference(&buffer->buf, NULL); + } else { + buffer->unprepared = true; } } @@ -561,29 +565,31 @@ bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buff bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*), unsigned size) { - if (buffer->buf && buffer->results_end + size >= buffer->buf->b.b.width0) - return true; + bool unprepared = buffer->unprepared; + buffer->unprepared = false; + + if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) { + if (buffer->buf) { + struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); + memcpy(qbuf, buffer, sizeof(*qbuf)); + buffer->previous = qbuf; + } + buffer->results_end = 0; - if (buffer->buf) { - struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); - memcpy(qbuf, buffer, sizeof(*qbuf)); - buffer->previous = qbuf; + /* Queries are normally read by the CPU after + * being written by the gpu, hence staging is probably a good + * usage pattern. + */ + struct si_screen *screen = sctx->screen; + unsigned buf_size = MAX2(size, screen->info.min_alloc_size); + buffer->buf = si_resource( + pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!buffer->buf)) + return false; + unprepared = true; } - buffer->results_end = 0; - - /* Queries are normally read by the CPU after - * being written by the gpu, hence staging is probably a good - * usage pattern. 
- */ - struct si_screen *screen = sctx->screen; - unsigned buf_size = MAX2(size, screen->info.min_alloc_size); - buffer->buf = si_resource( - pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!buffer->buf)) - return false; - - if (prepare_buffer) { + if (unprepared && prepare_buffer) { if (unlikely(!prepare_buffer(sctx, buffer))) { si_resource_reference(&buffer->buf, NULL); return false; diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index aaf0bd03aca..c61af51d57c 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -177,12 +177,13 @@ struct si_query_hw_ops { struct si_query_buffer { /* The buffer where query results are stored. */ struct si_resource *buf; - /* Offset of the next free result after current query data */ - unsigned results_end; /* If a query buffer is full, a new buffer is created and the old one * is put in here. When we calculate the result, we sum up the samples * from all buffers. 
*/ struct si_query_buffer *previous; + /* Offset of the next free result after current query data */ + unsigned results_end; + bool unprepared; }; void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 7554f5b9f8b..d7618b46eb0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -834,7 +834,7 @@ si_lower_nir(struct si_shader_selector* sel) NIR_PASS(progress, sel->nir, nir_opt_if); NIR_PASS(progress, sel->nir, nir_opt_dead_cf); NIR_PASS(progress, sel->nir, nir_opt_cse); - NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true); /* Needed for algebraic lowering */ NIR_PASS(progress, sel->nir, nir_opt_algebraic); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 89d81c97e18..85103a614b1 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -256,7 +256,7 @@ static void si_emit_cb_render_state(struct si_context *sctx) sx_blend_opt_control); } if (initial_cdw != cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -792,7 +792,7 @@ static void si_emit_clip_regs(struct si_context *sctx) S_028810_CLIP_DISABLE(window_space)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -1446,7 +1446,7 @@ static void si_emit_db_render_state(struct si_context *sctx) SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /* @@ -3527,7 +3527,7 @@ static void si_emit_msaa_config(struct si_context *sctx) SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); if (initial_cdw != cs->current.cdw) { - sctx->context_roll_counter++; + 
sctx->context_roll = true; /* GFX9: Flush DFSM when the AA mode changes. */ if (sctx->screen->dfsm_allowed) { diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 767e789276a..344f45e7e43 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -224,7 +224,8 @@ static inline unsigned si_atoms_that_always_roll_context(void) SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) | SI_ATOM_BIT(stencil_ref) | - SI_ATOM_BIT(scratch_state)); + SI_ATOM_BIT(scratch_state) | + SI_ATOM_BIT(window_rectangles)); } struct si_shader_data { diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 3516e561282..5c6c2e69b90 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -321,7 +321,7 @@ static void si_emit_dpbb_disable(struct si_context *sctx) S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } void si_emit_dpbb_state(struct si_context *sctx) @@ -443,5 +443,5 @@ void si_emit_dpbb_state(struct si_context *sctx) S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index c7c02d20d15..7bf82b8b05b 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -66,7 +66,7 @@ static unsigned si_conv_pipe_prim(unsigned mode) * The information about LDS and other non-compile-time parameters is then * written to userdata SGPRs. 
*/ -static bool si_emit_derived_tess_state(struct si_context *sctx, +static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { @@ -110,7 +110,7 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) { *num_patches = sctx->last_num_patches; - return false; + return; } sctx->last_ls = ls_current; @@ -305,9 +305,8 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, ls_hs_config); } sctx->last_ls_hs_config = ls_hs_config; - return true; /* true if the context rolls */ + sctx->context_roll = true; } - return false; } static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info) @@ -541,7 +540,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, } /* rast_prim is the primitive type after GS. */ -static bool si_emit_rasterizer_prim_state(struct si_context *sctx) +static void si_emit_rasterizer_prim_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = sctx->gfx_cs; enum pipe_prim_type rast_prim = sctx->current_rast_prim; @@ -549,11 +548,11 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx) /* Skip this if not rendering lines. */ if (!util_prim_is_lines(rast_prim)) - return false; + return; if (rast_prim == sctx->last_rast_prim && rs->pa_sc_line_stipple == sctx->last_sc_line_stipple) - return false; + return; /* For lines, reset the stipple pattern at each primitive. Otherwise, * reset the stipple pattern at each packet (line strips, line loops). 
@@ -564,7 +563,7 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx) sctx->last_rast_prim = rast_prim; sctx->last_sc_line_stipple = rs->pa_sc_line_stipple; - return true; /* true if the context rolls */ + sctx->context_roll = true; } static void si_emit_vs_state(struct si_context *sctx, @@ -659,6 +658,7 @@ static void si_emit_draw_registers(struct si_context *sctx, radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index); sctx->last_restart_index = info->restart_index; + sctx->context_roll = true; } } @@ -896,6 +896,10 @@ static void si_emit_surface_sync(struct si_context *sctx, radeon_emit(cs, 0); /* CP_COHER_BASE */ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ } + + /* ACQUIRE_MEM has an implicit context roll if the current context + * is busy. */ + sctx->context_roll = true; } void si_emit_cache_flush(struct si_context *sctx) @@ -1210,26 +1214,10 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i unsigned skip_atom_mask) { unsigned num_patches = 0; - /* Vega10/Raven scissor bug workaround. When any context register is - * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR - * registers must be written too. - */ - bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) && - !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors); - bool context_roll = false; /* set correctly for GFX9 only */ - context_roll |= si_emit_rasterizer_prim_state(sctx); + si_emit_rasterizer_prim_state(sctx); if (sctx->tes_shader.cso) - context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches); - - if (handle_scissor_bug && - (info->count_from_stream_output || - sctx->dirty_atoms & si_atoms_that_always_roll_context() || - sctx->dirty_states & si_states_that_always_roll_context() || - si_prim_restart_index_changed(sctx, info))) - context_roll = true; - - sctx->context_roll_counter = 0; + si_emit_derived_tess_state(sctx, info, &num_patches); /* Emit state atoms. 
*/ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; @@ -1252,12 +1240,6 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } sctx->dirty_states = 0; - if (handle_scissor_bug && - (context_roll || sctx->context_roll_counter)) { - sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - sctx->atoms.s.scissors.emit(sctx); - } - /* Emit draw states. */ si_emit_vs_state(sctx, info); si_emit_draw_registers(sctx, info, num_patches); @@ -1456,6 +1438,22 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (!si_upload_vertex_buffer_descriptors(sctx)) return; + /* Vega10/Raven scissor bug workaround. When any context register is + * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR + * registers must be written too. + */ + bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug; + unsigned masked_atoms = 0; + + if (has_gfx9_scissor_bug) { + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); + + if (info->count_from_stream_output || + sctx->dirty_atoms & si_atoms_that_always_roll_context() || + sctx->dirty_states & si_states_that_always_roll_context()) + sctx->context_roll = true; + } + /* Use optimal packet order based on whether we need to sync the pipeline. */ if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB | @@ -1466,8 +1464,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i * Then draw and prefetch at the end. This ensures that the time * the CUs are idle is very short. 
*/ - unsigned masked_atoms = 0; - if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); @@ -1481,6 +1477,13 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) sctx->atoms.s.render_cond.emit(sctx); + + if (has_gfx9_scissor_bug && + (sctx->context_roll || + si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); @@ -1505,7 +1508,16 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i if (!si_upload_graphics_shader_descriptors(sctx)) return; - si_emit_all_states(sctx, info, 0); + si_emit_all_states(sctx, info, masked_atoms); + + if (has_gfx9_scissor_bug && + (sctx->context_roll || + si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) { + sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; + sctx->atoms.s.scissors.emit(sctx); + } + sctx->dirty_atoms = 0; + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* Prefetch the remaining shaders after the draw has been @@ -1514,6 +1526,9 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i cik_emit_prefetch_L2(sctx, false); } + /* Clear the context roll flag after the draw call. 
*/ + sctx->context_roll = false; + if (unlikely(sctx->current_saved_cs)) { si_trace_emit(sctx); si_log_draw_state(sctx, sctx->log); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9e052e1efce..e76bb49dff8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -576,7 +576,7 @@ static void si_emit_shader_es(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) @@ -825,7 +825,7 @@ static void si_emit_shader_gs(struct si_context *sctx) } if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -1002,7 +1002,7 @@ static void si_emit_shader_vs(struct si_context *sctx) shader->vgt_vertex_reuse_block_cntl); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /** @@ -1194,7 +1194,7 @@ static void si_emit_shader_ps(struct si_context *sctx) shader->ctx_reg.ps.cb_shader_mask); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } static void si_shader_ps(struct si_shader *shader) @@ -2869,7 +2869,7 @@ static void si_emit_spi_map(struct si_context *sctx) sctx->tracked_regs.spi_ps_input_cntl, num_interp); if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll_counter++; + sctx->context_roll = true; } /** diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 2bf6862c89b..2a0a4bef9a2 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -303,6 +303,7 @@ void 
si_emit_streamout_end(struct si_context *sctx) * buffer bound. This ensures that the primitives-emitted query * won't increment. */ radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + sctx->context_roll = true; t[i]->buf_filled_size_valid = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index dac90df1c4f..1ec69216841 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -185,6 +185,16 @@ static void si_emit_guardband(struct si_context *ctx) const unsigned hw_screen_offset_alignment = ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); + /* Indexed by quantization modes */ + static int max_viewport_size[] = {65535, 16383, 4095}; + + /* Ensure that the whole viewport stays representable in + * absolute coordinates. + * See comment in si_set_viewport_states. + */ + assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && + vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); + hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); @@ -219,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx) * * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. 
*/ - static unsigned max_viewport_size[] = {65535, 16383, 4095}; assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; left = (-max_range - vp.translate[0]) / vp.scale[0]; @@ -274,7 +283,7 @@ static void si_emit_guardband(struct si_context *ctx) S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); if (initial_cdw != ctx->gfx_cs->current.cdw) - ctx->context_roll_counter++; + ctx->context_roll = true; } static void si_emit_scissors(struct si_context *ctx) @@ -333,6 +342,8 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned h = scissor->maxy - scissor->miny; unsigned max_extent = MAX2(w, h); + int max_corner = MAX2(scissor->maxx, scissor->maxy); + unsigned center_x = (scissor->maxx + scissor->minx) / 2; unsigned center_y = (scissor->maxy + scissor->miny) / 2; unsigned max_center = MAX2(center_x, center_y); @@ -358,7 +369,22 @@ static void si_set_viewport_states(struct pipe_context *pctx, if (ctx->family == CHIP_RAVEN) max_extent = 16384; /* Use QUANT_MODE == 16_8. */ - if (max_extent <= 1024) /* 4K scanline area for guardband */ + /* Another constraint is that all coordinates in the viewport + * are representable in fixed point with respect to the + * surface origin. + * + * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given + * an offset that would make the upper corner of the viewport + * greater than the maximum representable number post + * quantization, ie 2^quant_bits. + * + * This does not matter for 14.10 and 16.8 formats since the + * offset is already limited at 8k, but it means we can't use + * 12.12 if we are drawing to some pixels outside the lower + * 4k x 4k of the render target. 
+ */ + + if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; else if (max_extent <= 4096) /* 16K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index 90a2032cd80..7e396e671be 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -309,7 +309,7 @@ void si_test_dma(struct si_screen *sscreen) /* clear dst pixels */ uint32_t zero = 0; si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, - SI_COHERENCY_SHADER); + SI_COHERENCY_SHADER, false); memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* preparation */ diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 44e48cc7ee4..6931b52dc9f 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -265,6 +265,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_CLEAR_TEXTURE: return 1; + case PIPE_CAP_MAX_VARYINGS: + return TGSI_EXEC_MAX_INPUT_ATTRIBS; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index ffe49260b9a..a91e4f588c8 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -390,17 +390,6 @@ setup_sort_vertices(struct setup_context *setup, return FALSE; } - - /* Prepare pixel offset for rasterisation: - * - pixel center (0.5, 0.5) for GL, or - * - assume (0.0, 0.0) for other APIs. 
- */ - if (setup->softpipe->rasterizer->half_pixel_center) { - setup->pixel_offset = 0.5f; - } else { - setup->pixel_offset = 0.0f; - } - return TRUE; } @@ -1476,6 +1465,16 @@ sp_setup_prepare(struct setup_context *setup) } } + /* Prepare pixel offset for rasterisation: + * - pixel center (0.5, 0.5) for GL, or + * - assume (0.0, 0.0) for other APIs. + */ + if (setup->softpipe->rasterizer->half_pixel_center) { + setup->pixel_offset = 0.5f; + } else { + setup->pixel_offset = 0.0f; + } + setup->max_layer = max_layer; sp->quad.first->begin( sp->quad.first ); diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c index 351736ee421..998939bdf30 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -373,17 +373,18 @@ sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc, int layer) if (util_format_is_pure_uint(tc->surface->format)) { pipe_put_tile_ui_format(pt, tc->transfer_map[layer], x, y, TILE_SIZE, TILE_SIZE, - pt->resource->format, + tc->surface->format, (unsigned *) tc->tile->data.colorui128); } else if (util_format_is_pure_sint(tc->surface->format)) { pipe_put_tile_i_format(pt, tc->transfer_map[layer], x, y, TILE_SIZE, TILE_SIZE, - pt->resource->format, + tc->surface->format, (int *) tc->tile->data.colori128); } else { - pipe_put_tile_rgba(pt, tc->transfer_map[layer], - x, y, TILE_SIZE, TILE_SIZE, - (float *) tc->tile->data.color); + pipe_put_tile_rgba_format(pt, tc->transfer_map[layer], + x, y, TILE_SIZE, TILE_SIZE, + tc->surface->format, + (float *) tc->tile->data.color); } } numCleared++; diff --git a/src/gallium/drivers/svga/Makefile.sources b/src/gallium/drivers/svga/Makefile.sources index 72024cf60e1..229d2863c84 100644 --- a/src/gallium/drivers/svga/Makefile.sources +++ b/src/gallium/drivers/svga/Makefile.sources @@ -15,8 +15,6 @@ C_SOURCES := \ svga_hw_reg.h \ svga_link.c \ svga_link.h \ - svga_msg.c \ - svga_msg.h \ svga_mksstats.h \ 
svga_pipe_blend.c \ svga_pipe_blit.c \ diff --git a/src/gallium/drivers/svga/meson.build b/src/gallium/drivers/svga/meson.build index 7981e2991f3..4d3207a9942 100644 --- a/src/gallium/drivers/svga/meson.build +++ b/src/gallium/drivers/svga/meson.build @@ -27,7 +27,6 @@ files_svga = files( 'svga_draw_elements.c', 'svga_format.c', 'svga_link.c', - 'svga_msg.c', 'svga_pipe_blend.c', 'svga_pipe_blit.c', 'svga_pipe_clear.c', diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c index 5557d208171..6577c839cf0 100644 --- a/src/gallium/drivers/svga/svga_cmd.c +++ b/src/gallium/drivers/svga/svga_cmd.c @@ -1693,7 +1693,7 @@ SVGA3D_BindGBSurface(struct svga_winsys_context *swc, return PIPE_ERROR_OUT_OF_MEMORY; swc->surface_relocation(swc, &cmd->sid, &cmd->mobid, surface, - SVGA_RELOC_READ | SVGA_RELOC_INTERNAL); + SVGA_RELOC_READ); swc->commit(swc); diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 95dde8b0897..f747ff78bcf 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -37,7 +37,6 @@ #include "svga_public.h" #include "svga_context.h" #include "svga_format.h" -#include "svga_msg.h" #include "svga_screen.h" #include "svga_tgsi.h" #include "svga_resource_texture.h" @@ -350,6 +349,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */ + case PIPE_CAP_MAX_VARYINGS: + return sws->have_vgpu10 ? 
VGPU10_MAX_FS_INPUTS : 10; /* Unsupported features */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -889,17 +890,18 @@ svga_get_driver_query_info(struct pipe_screen *screen, static void init_logging(struct pipe_screen *screen) { + struct svga_screen *svgascreen = svga_screen(screen); static const char *log_prefix = "Mesa: "; char host_log[1000]; /* Log Version to Host */ util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix), - "%s%s", log_prefix, svga_get_name(screen)); - svga_host_log(host_log); + "%s%s\n", log_prefix, svga_get_name(screen)); + svgascreen->sws->host_log(svgascreen->sws, host_log); util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix), "%s" PACKAGE_VERSION MESA_GIT_SHA1, log_prefix); - svga_host_log(host_log); + svgascreen->sws->host_log(svgascreen->sws, host_log); /* If the SVGA_EXTRA_LOGGING env var is set, log the process's command * line (program name and arguments). @@ -908,13 +910,23 @@ init_logging(struct pipe_screen *screen) char cmdline[1000]; if (os_get_command_line(cmdline, sizeof(cmdline))) { util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix), - "%s%s", log_prefix, cmdline); - svga_host_log(host_log); + "%s%s\n", log_prefix, cmdline); + svgascreen->sws->host_log(svgascreen->sws, host_log); } } } +/** + * no-op logging function to use when SVGA_NO_LOGGING is set. 
+ */ +static void +nop_host_log(struct svga_winsys_screen *sws, const char *message) +{ + /* nothing */ +} + + static void svga_destroy_screen( struct pipe_screen *screen ) { @@ -1132,7 +1144,11 @@ svga_screen_create(struct svga_winsys_screen *sws) svga_screen_cache_init(svgascreen); - init_logging(screen); + if (debug_get_bool_option("SVGA_NO_LOGGING", FALSE) == TRUE) { + svgascreen->sws->host_log = nop_host_log; + } else { + init_logging(screen); + } return screen; error2: diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h index a067a7ba09d..14782e19a7d 100644 --- a/src/gallium/drivers/svga/svga_winsys.h +++ b/src/gallium/drivers/svga/svga_winsys.h @@ -753,6 +753,11 @@ struct svga_winsys_screen void (*stats_time_pop)(); + /** + * Send a host log message + */ + void + (*host_log)(struct svga_winsys_screen *sws, const char *message); /** Have VGPU v10 hardware? */ boolean have_vgpu10; diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index de9008ddf6a..bee011e4abf 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -369,6 +369,8 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) return 32; case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: return 1 << 27; + case PIPE_CAP_MAX_VARYINGS: + return 32; case PIPE_CAP_VENDOR_ID: return 0xFFFFFFFF; @@ -844,7 +846,9 @@ swr_texture_layout(struct swr_screen *screen, size_t total_size = (uint64_t)res->swr.depth * res->swr.qpitch * res->swr.pitch * res->swr.numSamples; - if (total_size > SWR_MAX_TEXTURE_SIZE) + + // Let non-sampled textures (e.g. 
buffer objects) bypass the size limit + if (swr_resource_is_texture(&res->base) && total_size > SWR_MAX_TEXTURE_SIZE) return false; if (allocate) { diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c index 2f36bdd46e3..d42e8fd0e69 100644 --- a/src/gallium/drivers/v3d/v3d_blit.c +++ b/src/gallium/drivers/v3d/v3d_blit.c @@ -491,7 +491,8 @@ v3d_tfu_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) if ((info->mask & PIPE_MASK_RGBA) == 0) return false; - if (info->dst.box.x != 0 || + if (info->scissor_enable || + info->dst.box.x != 0 || info->dst.box.y != 0 || info->dst.box.width != dst_width || info->dst.box.height != dst_height || diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c index 21c68942e14..84e86799d5e 100644 --- a/src/gallium/drivers/v3d/v3d_resource.c +++ b/src/gallium/drivers/v3d/v3d_resource.c @@ -780,7 +780,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, rsc->tiled = false; } else { fprintf(stderr, "Unsupported modifier requested\n"); - return NULL; + goto fail; } rsc->internal_format = prsc->format; diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index bed2c63a64d..17afeebb4fc 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -70,6 +70,7 @@ v3d_screen_destroy(struct pipe_screen *pscreen) util_hash_table_destroy(screen->bo_handles); v3d_bufmgr_destroy(pscreen); slab_destroy_parent(&screen->transfer_pool); + free(screen->ro); if (using_v3d_simulator) v3d_simulator_destroy(screen); @@ -177,11 +178,17 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: return 4; + case PIPE_CAP_MAX_VARYINGS: + return V3D_MAX_FS_INPUTS / 4; + /* Texturing. 
*/ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - return V3D_MAX_MIP_LEVELS; + if (screen->devinfo.ver < 40) + return 12; + else + return V3D_MAX_MIP_LEVELS; case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: return 2048; diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c index 2700208e388..4b1b03b5db5 100644 --- a/src/gallium/drivers/v3d/v3dx_draw.c +++ b/src/gallium/drivers/v3d/v3dx_draw.c @@ -55,7 +55,28 @@ v3d_start_draw(struct v3d_context *v3d) job->submit.bcl_start = job->bcl.bo->offset; v3d_job_add_bo(job, job->bcl.bo); - job->tile_alloc = v3d_bo_alloc(v3d->screen, 1024 * 1024, "tile_alloc"); + /* The PTB will request the tile alloc initial size per tile at start + * of tile binning. + */ + uint32_t tile_alloc_size = (job->draw_tiles_x * + job->draw_tiles_y) * 64; + /* The PTB allocates in aligned 4k chunks after the initial setup. */ + tile_alloc_size = align(tile_alloc_size, 4096); + + /* Include the first two chunk allocations that the PTB does so that + * we definitely clear the OOM condition before triggering one (the HW + * won't trigger OOM during the first allocations). + */ + tile_alloc_size += 8192; + + /* For performance, allocate some extra initial memory after the PTB's + * minimal allocations, so that we hopefully don't have to block the + * GPU on the kernel handling an OOM signal. + */ + tile_alloc_size += 512 * 1024; + + job->tile_alloc = v3d_bo_alloc(v3d->screen, tile_alloc_size, + "tile_alloc"); uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64; job->tile_state = v3d_bo_alloc(v3d->screen, job->draw_tiles_y * @@ -203,8 +224,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, * shader needs to write the Z value (even just discards). 
*/ shader.fragment_shader_does_z_writes = - (v3d->prog.fs->prog_data.fs->writes_z || - v3d->prog.fs->prog_data.fs->discard); + v3d->prog.fs->prog_data.fs->writes_z; + /* Set if the EZ test must be disabled (due to shader side + * effects and the early_z flag not being present in the + * shader). + */ + shader.turn_off_early_z_test = + v3d->prog.fs->prog_data.fs->disable_ez; shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 = v3d->prog.fs->prog_data.fs->uses_center_w; diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c index f326b5379ba..eff6bcfca06 100644 --- a/src/gallium/drivers/v3d/v3dx_state.c +++ b/src/gallium/drivers/v3d/v3dx_state.c @@ -846,6 +846,9 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, prsc->target == PIPE_TEXTURE_1D_ARRAY) { tex->image_height = tex->image_width >> 14; } + + tex->image_width &= (1 << 14) - 1; + tex->image_height &= (1 << 14) - 1; #endif if (prsc->target == PIPE_TEXTURE_3D) { diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 2d0a52bb5fb..8f1e561c444 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c index 6e4681e93cc..f08785f457f 100644 --- a/src/gallium/drivers/vc4/vc4_query.c +++ b/src/gallium/drivers/vc4/vc4_query.c @@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries, /* We can't mix HW and non-HW queries. 
*/ if (nhwqueries && nhwqueries != num_queries) - return NULL; + goto err_free_query; if (!nhwqueries) return (struct pipe_query *)query; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index e7f7c82c271..acb4a1feb0d 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Note: Not supported in hardware, just faking it. */ return 5; + case PIPE_CAP_MAX_VARYINGS: + return 8; + case PIPE_CAP_VENDOR_ID: return 0x14E4; case PIPE_CAP_ACCELERATED: diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c index 08f85f8574a..f9d8e231a13 100644 --- a/src/gallium/drivers/virgl/virgl_context.c +++ b/src/gallium/drivers/virgl/virgl_context.c @@ -765,7 +765,6 @@ static void virgl_flush_from_st(struct pipe_context *ctx, enum pipe_flush_flags flags) { struct virgl_context *vctx = virgl_context(ctx); - struct virgl_screen *rs = virgl_screen(ctx->screen); if (flags & PIPE_FLUSH_FENCE_FD) vctx->cbuf->needs_out_fence_fd = true; diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index 42e0987e0c9..17fa5fc51cc 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -258,6 +258,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_FLOAT_LINEAR: case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; /* TODO: need to introduce a hw-cap for this */ + case PIPE_CAP_MAX_VARYINGS: + if (vscreen->caps.caps.v1.glsl_level < 150) + return vscreen->caps.caps.v2.max_vertex_attribs; + return 32; case PIPE_CAP_TEXTURE_GATHER_SM5: case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_FAKE_SW_MSAA: diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 867d0cb5d74..96e8fbed1be 100644 --- 
a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -856,6 +856,7 @@ enum pipe_cap PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE, PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND, PIPE_CAP_DEST_SURFACE_SRGB_CONTROL, + PIPE_CAP_MAX_VARYINGS, }; /** diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h index b5b8b062285..adbe7858d0f 100644 --- a/src/gallium/include/pipe/p_video_enums.h +++ b/src/gallium/include/pipe/p_video_enums.h @@ -70,7 +70,8 @@ enum pipe_video_profile PIPE_VIDEO_PROFILE_HEVC_MAIN_444, PIPE_VIDEO_PROFILE_JPEG_BASELINE, PIPE_VIDEO_PROFILE_VP9_PROFILE0, - PIPE_VIDEO_PROFILE_VP9_PROFILE2 + PIPE_VIDEO_PROFILE_VP9_PROFILE2, + PIPE_VIDEO_PROFILE_MAX }; /* Video caps, can be different for each codec/profile */ diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c index ebbbabb6492..930d440a1e2 100644 --- a/src/gallium/state_trackers/dri/dri2.c +++ b/src/gallium/state_trackers/dri/dri2.c @@ -994,11 +994,6 @@ dri2_create_image_common(__DRIscreen *_screen, if (!map) return NULL; - /* createImageWithModifiers doesn't supply usage, and we should not get - * here with both modifiers and a usage flag. 
- */ - assert(!(use && (modifiers != NULL))); - tex_usage = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; if (use & __DRI_IMAGE_USE_SCANOUT) @@ -1071,7 +1066,7 @@ dri2_create_image_with_modifiers(__DRIscreen *dri_screen, void *loaderPrivate) { return dri2_create_image_common(dri_screen, width, height, format, - 0 /* use */, modifiers, count, + __DRI_IMAGE_USE_SHARE, modifiers, count, loaderPrivate); } diff --git a/src/gallium/state_trackers/glx/xlib/meson.build b/src/gallium/state_trackers/glx/xlib/meson.build index f4ee75426bc..34b93c94cf2 100644 --- a/src/gallium/state_trackers/glx/xlib/meson.build +++ b/src/gallium/state_trackers/glx/xlib/meson.build @@ -23,5 +23,5 @@ libxlib = static_library( files('glx_api.c', 'glx_getproc.c', 'glx_usefont.c', 'xm_api.c', 'xm_st.c'), c_args : c_vis_args, include_directories : [inc_common, inc_mapi, inc_mesa], - dependencies : [dep_x11, dep_xext, dep_xcb], + dependencies : [dep_x11, dep_xext, dep_xcb, dep_glproto], ) diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h index 7b68c09c47a..0595da5535a 100644 --- a/src/gallium/state_trackers/nine/nine_pipe.h +++ b/src/gallium/state_trackers/nine/nine_pipe.h @@ -377,6 +377,10 @@ d3dmultisample_type_check(struct pipe_screen *screen, if (levels) *levels = 1; + /* Ignores multisamplequality */ + if (*multisample == D3DMULTISAMPLE_NONE) + return D3D_OK; + if (*multisample == D3DMULTISAMPLE_NONMASKABLE) { if (depth_stencil_format(format)) bind = d3d9_get_pipe_depth_format_bindings(format); diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c index 6c22be24c7c..8026ee16b7a 100644 --- a/src/gallium/state_trackers/nine/swapchain9.c +++ b/src/gallium/state_trackers/nine/swapchain9.c @@ -133,6 +133,13 @@ D3DWindowBuffer_release(struct NineSwapChain9 *This, D3DWindowBuffer *present_handle) { int i; + + /* IsBufferReleased API not available */ + if (This->base.device->minor_version_num <= 
2) { + ID3DPresent_DestroyD3DWindowBuffer(This->present, present_handle); + return; + } + /* Add it to the 'pending release' list */ for (i = 0; i < D3DPRESENT_BACK_BUFFERS_MAX_EX + 1; i++) { if (!This->present_handles_pending_release[i]) { @@ -750,9 +757,19 @@ present( struct NineSwapChain9 *This, if (This->params.SwapEffect == D3DSWAPEFFECT_DISCARD) handle_draw_cursor_and_hud(This, resource); - ID3DPresent_GetWindowInfo(This->present, hDestWindowOverride, &target_width, &target_height, &target_depth); + hr = ID3DPresent_GetWindowInfo(This->present, hDestWindowOverride, &target_width, &target_height, &target_depth); (void)target_depth; + /* Can happen with old Wine (presentation can still succeed), + * or at window destruction. + * Also disable for very old wine as D3DWindowBuffer_release + * cannot do the DestroyD3DWindowBuffer workaround. */ + if (FAILED(hr) || target_width == 0 || target_height == 0 || + This->base.device->minor_version_num <= 2) { + target_width = resource->width0; + target_height = resource->height0; + } + /* Switch to using presentation buffers on window resize. 
* Note: Most apps should resize the d3d back buffers when * a window resize is detected, which will result in a call to diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index 14e904ee490..47a5e7be230 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx) ctx->version_minor = 1; *ctx->vtable = vtable; *ctx->vtable_vpp = vtable_vpp; - ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN; + ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1; ctx->max_entrypoints = 2; ctx->max_attributes = 1; ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS; diff --git a/src/gallium/state_trackers/va/picture_mpeg12.c b/src/gallium/state_trackers/va/picture_mpeg12.c index 1e5a9c7428d..daf95f7403c 100644 --- a/src/gallium/state_trackers/va/picture_mpeg12.c +++ b/src/gallium/state_trackers/va/picture_mpeg12.c @@ -27,6 +27,19 @@ #include "va_private.h" +const int reverse_inverse_zscan[] = +{ + /* Reverse inverse z scan pattern */ + 0, 2, 3, 9, 10, 20, 21, 35, + 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, + 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, + 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, + 28, 42, 43, 53, 54, 60, 61, 63, +}; + void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { VAPictureParameterBufferMPEG2 *mpeg2 = buf->data; @@ -66,16 +79,29 @@ void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *contex void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf) { VAIQMatrixBufferMPEG2 *mpeg2 = buf->data; + static uint8_t temp_intra_matrix[64]; + static uint8_t temp_nonintra_matrix[64]; assert(buf->size >= sizeof(VAIQMatrixBufferMPEG2) && buf->num_elements == 1); - if (mpeg2->load_intra_quantiser_matrix) - 
context->desc.mpeg12.intra_matrix = mpeg2->intra_quantiser_matrix; - else + if (mpeg2->load_intra_quantiser_matrix) { + /* The quantiser matrix that VAAPI provides has been applied + with inverse z-scan. However, what we expect in MPEG2 + picture description is the original order. Therefore, + we need to reverse it back to its original order. + */ + for (int i = 0; i < 64; i++) + temp_intra_matrix[i] = + mpeg2->intra_quantiser_matrix[reverse_inverse_zscan[i]]; + context->desc.mpeg12.intra_matrix = temp_intra_matrix; + } else context->desc.mpeg12.intra_matrix = NULL; - if (mpeg2->load_non_intra_quantiser_matrix) - context->desc.mpeg12.non_intra_matrix = mpeg2->non_intra_quantiser_matrix; - else + if (mpeg2->load_non_intra_quantiser_matrix) { + for (int i = 0; i < 64; i++) + temp_nonintra_matrix[i] = + mpeg2->non_intra_quantiser_matrix[reverse_inverse_zscan[i]]; + context->desc.mpeg12.non_intra_matrix = temp_nonintra_matrix; + } else context->desc.mpeg12.non_intra_matrix = NULL; } diff --git a/src/gallium/state_trackers/va/picture_vp9.c b/src/gallium/state_trackers/va/picture_vp9.c index c1ca54cd008..b5aca9a513c 100644 --- a/src/gallium/state_trackers/va/picture_vp9.c +++ b/src/gallium/state_trackers/va/picture_vp9.c @@ -28,6 +28,8 @@ #include "vl/vl_vlc.h" #include "va_private.h" +#define NUM_VP9_REFS 8 + void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { VADecPictureParameterBufferVP9 *vp9 = buf->data; @@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth; - for (i = 0 ; i < 8 ; i++) + for (i = 0 ; i < NUM_VP9_REFS ; i++) vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]); + + if (!context->decoder && !context->templat.max_references) + context->templat.max_references = NUM_VP9_REFS; } void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf) diff --git 
a/src/gallium/state_trackers/xvmc/attributes.c b/src/gallium/state_trackers/xvmc/attributes.c index 375705669b0..6e4d78a9a29 100644 --- a/src/gallium/state_trackers/xvmc/attributes.c +++ b/src/gallium/state_trackers/xvmc/attributes.c @@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int if (!attr) return XvMCBadContext; - if (strcmp(attr, XV_BRIGHTNESS)) + if (strcmp(attr, XV_BRIGHTNESS) == 0) context_priv->procamp.brightness = value / 1000.0f; - else if (strcmp(attr, XV_CONTRAST)) + else if (strcmp(attr, XV_CONTRAST) == 0) context_priv->procamp.contrast = value / 1000.0f + 1.0f; - else if (strcmp(attr, XV_SATURATION)) + else if (strcmp(attr, XV_SATURATION) == 0) context_priv->procamp.saturation = value / 1000.0f + 1.0f; - else if (strcmp(attr, XV_HUE)) + else if (strcmp(attr, XV_HUE) == 0) context_priv->procamp.hue = value / 1000.0f; - else if (strcmp(attr, XV_COLORSPACE)) + else if (strcmp(attr, XV_COLORSPACE) == 0) context_priv->color_standard = value ? 
VL_CSC_COLOR_STANDARD_BT_601 : VL_CSC_COLOR_STANDARD_BT_709; @@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int if (!attr) return XvMCBadContext; - if (strcmp(attr, XV_BRIGHTNESS)) + if (strcmp(attr, XV_BRIGHTNESS) == 0) *value = context_priv->procamp.brightness * 1000; - else if (strcmp(attr, XV_CONTRAST)) + else if (strcmp(attr, XV_CONTRAST) == 0) *value = context_priv->procamp.contrast * 1000 - 1000; - else if (strcmp(attr, XV_SATURATION)) + else if (strcmp(attr, XV_SATURATION) == 0) *value = context_priv->procamp.saturation * 1000 + 1000; - else if (strcmp(attr, XV_HUE)) + else if (strcmp(attr, XV_HUE) == 0) *value = context_priv->procamp.hue * 1000; - else if (strcmp(attr, XV_COLORSPACE)) + else if (strcmp(attr, XV_COLORSPACE) == 0) *value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709; else return BadName; diff --git a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c index 3cd23173c7c..dbd705639f6 100644 --- a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c +++ b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c @@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config) while (token && !fail) { - if (strcmp(token, "i")) + if (strcmp(token, "i") == 0) config->mb_types |= MB_TYPE_I; - else if (strcmp(token, "p")) + else if (strcmp(token, "p") == 0) config->mb_types |= MB_TYPE_P; - else if (strcmp(token, "b")) + else if (strcmp(token, "b") == 0) config->mb_types |= MB_TYPE_B; else fail = 1; diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build index bc72b1110a0..b3c31c5dc6d 100644 --- a/src/gallium/targets/d3dadapter9/meson.build +++ b/src/gallium/targets/d3dadapter9/meson.build @@ -68,5 +68,5 @@ pkg.generate( description : 'Native D3D driver modules', version : '.'.join(nine_version), requires_private : 'libdrm >= ' + dep_libdrm.version(), - variables : 
['moduledir=${prefix}/@0@'.format(d3d_drivers_path)], + variables : ['moduledir=@0@'.format(d3d_drivers_path)], ) diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk index 9c43fa1e8fd..6134251b5ca 100644 --- a/src/gallium/targets/dri/Android.mk +++ b/src/gallium/targets/dri/Android.mk @@ -40,12 +40,23 @@ LOCAL_LDFLAGS := \ -Wl,--undefined-version LOCAL_SHARED_LIBRARIES := \ - libbacktrace \ libdl \ libglapi \ - libexpat \ libz +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else +LOCAL_SHARED_LIBRARIES += \ + libexpat +endif + +ifeq ($(USE_LIBBACKTRACE),true) + LOCAL_SHARED_LIBRARIES += libbacktrace +endif + $(foreach d, $(MESA_BUILD_GALLIUM), $(eval LOCAL_CFLAGS += $(patsubst HAVE_%,-D%,$(d)))) # sort GALLIUM_LIBS to remove any duplicates diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build index 68d226621b2..edd0c007e48 100644 --- a/src/gallium/targets/dri/meson.build +++ b/src/gallium/targets/dri/meson.build @@ -60,6 +60,10 @@ libgallium_dri = shared_library( driver_tegra, driver_i915, driver_svga, driver_virgl, driver_swr, ], + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : dri_drivers_path, + name_suffix : 'so', ) foreach d : [[with_gallium_kmsro, 'pl111_dri.so'], diff --git a/src/gallium/targets/omx/meson.build b/src/gallium/targets/omx/meson.build index 6811e6ff904..7772ae47bb5 100644 --- a/src/gallium/targets/omx/meson.build +++ b/src/gallium/targets/omx/meson.build @@ -32,7 +32,7 @@ endif libomx_gallium = shared_library( 'omx_mesa', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [omx_link_args, ld_args_gc_sections], diff --git a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build index b4ae8f4b6ec..e873e311aa0 
100644 --- a/src/gallium/targets/osmesa/meson.build +++ b/src/gallium/targets/osmesa/meson.build @@ -43,9 +43,9 @@ libosmesa = shared_library( inc_gallium_drivers, ], link_depends : osmesa_link_deps, - link_whole : [libosmesa_st], + link_whole : [libosmesa_st, libglapi_static], link_with : [ - libmesa_gallium, libgallium, libglapi_static, libws_null, osmesa_link_with, + libmesa_gallium, libgallium, libws_null, osmesa_link_with, ], dependencies : [ dep_selinux, dep_thread, dep_clock, dep_unwind, diff --git a/src/gallium/targets/va/meson.build b/src/gallium/targets/va/meson.build index ded689b464d..4bfb5cbab7a 100644 --- a/src/gallium/targets/va/meson.build +++ b/src/gallium/targets/va/meson.build @@ -33,7 +33,7 @@ endif libva_gallium = shared_library( 'gallium_drv_video', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [va_link_args, ld_args_gc_sections], @@ -49,8 +49,10 @@ libva_gallium = shared_library( dep_libdrm, dep_thread, driver_r600, driver_radeonsi, driver_nouveau, ], link_depends : va_link_depends, + # Will be deleted during installation, see install_megadrivers.py install : true, install_dir : va_drivers_path, + name_suffix : 'so', ) foreach d : [[with_gallium_r600, 'r600'], diff --git a/src/gallium/targets/vdpau/meson.build b/src/gallium/targets/vdpau/meson.build index 22e3f5ffdd8..48f01ffba6c 100644 --- a/src/gallium/targets/vdpau/meson.build +++ b/src/gallium/targets/vdpau/meson.build @@ -38,7 +38,7 @@ endif libvdpau_gallium = shared_library( 'vdpau_gallium', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [vdpau_link_args, ld_args_gc_sections], @@ -55,6 +55,10 @@ libvdpau_gallium = shared_library( ], link_depends : vdpau_link_depends, soversion : '@0@.@1@.0'.format(VDPAU_MAJOR, VDPAU_MINOR), + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : vdpau_drivers_path, + name_suffix : 'so', ) 
foreach d : [[with_gallium_r300, 'r300'], [with_gallium_r600, 'r600'], diff --git a/src/gallium/targets/xa/meson.build b/src/gallium/targets/xa/meson.build index 733ef54ff85..582d5ef67f6 100644 --- a/src/gallium/targets/xa/meson.build +++ b/src/gallium/targets/xa/meson.build @@ -34,7 +34,7 @@ _xa_version = '.'.join(xa_version) libxatracker = shared_library( 'xatracker', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [xa_link_args, ld_args_gc_sections], diff --git a/src/gallium/targets/xvmc/meson.build b/src/gallium/targets/xvmc/meson.build index 0af5b6477ce..537275aab57 100644 --- a/src/gallium/targets/xvmc/meson.build +++ b/src/gallium/targets/xvmc/meson.build @@ -33,7 +33,7 @@ endif libxvmc_gallium = shared_library( 'XvMCgallium', - 'target.c', + ['target.c', xmlpool_options_h], c_args : c_vis_args, cpp_args : cpp_vis_args, link_args : [xvmc_link_args, ld_args_gc_sections], @@ -47,6 +47,10 @@ libxvmc_gallium = shared_library( ], dependencies : [dep_thread, driver_r600, driver_nouveau], link_depends : xvmc_link_depends, + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : xvmc_drivers_path, + name_suffix : 'so', ) foreach d : [[with_gallium_r600, 'r600'], [with_gallium_nouveau, 'nouveau']] diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index b4e62acbae4..2e595e5a1b0 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -386,7 +386,8 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs) cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE && cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC && cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC && - cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC; + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC && + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG; } static bool amdgpu_cs_has_chaining(struct amdgpu_cs 
*cs) @@ -1219,8 +1220,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs) { struct amdgpu_cs_context *cs = acs->csc; - cs->num_fence_dependencies = 0; - amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers); amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 79d2c1345ef..45e54b4791d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -92,6 +92,10 @@ static bool do_winsys_init(struct amdgpu_winsys *ws, if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo)) goto fail; + /* TODO: Enable this once the kernel handles it efficiently. */ + if (ws->info.has_dedicated_vram) + ws->info.has_local_buffers = false; + handle_env_var_force_family(ws); ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment); diff --git a/src/gallium/winsys/svga/drm/Makefile.sources b/src/gallium/winsys/svga/drm/Makefile.sources index f82b0097b5b..191f0b88b4a 100644 --- a/src/gallium/winsys/svga/drm/Makefile.sources +++ b/src/gallium/winsys/svga/drm/Makefile.sources @@ -8,6 +8,8 @@ C_SOURCES := \ vmw_fence.c \ vmw_fence.h \ vmwgfx_drm.h \ + vmw_msg.c \ + vmw_msg.h \ vmw_screen.c \ vmw_screen_dri.c \ vmw_screen.h \ diff --git a/src/gallium/winsys/svga/drm/meson.build b/src/gallium/winsys/svga/drm/meson.build index 24f67aca9ec..55266ce1623 100644 --- a/src/gallium/winsys/svga/drm/meson.build +++ b/src/gallium/winsys/svga/drm/meson.build @@ -23,6 +23,7 @@ files_svgadrm = files( 'vmw_buffer.c', 'vmw_context.c', 'vmw_fence.c', + 'vmw_msg.c', 'vmw_screen.c', 'vmw_screen_dri.c', 'vmw_screen_ioctl.c', diff --git a/src/gallium/drivers/svga/svga_msg.c b/src/gallium/winsys/svga/drm/vmw_msg.c old 
mode 100755 new mode 100644 similarity index 93% rename from src/gallium/drivers/svga/svga_msg.c rename to src/gallium/winsys/svga/drm/vmw_msg.c index 8b63132cb57..8cce2241f36 --- a/src/gallium/drivers/svga/svga_msg.c +++ b/src/gallium/winsys/svga/drm/vmw_msg.c @@ -29,7 +29,8 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "pipe/p_defines.h" -#include "svga_msg.h" +#include "svga_winsys.h" +#include "vmw_msg.h" #define MESSAGE_STATUS_SUCCESS 0x0001 @@ -83,7 +84,7 @@ port_num, magic, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("inl %%dx, %%eax;" : \ + __asm__ volatile ("inl %%dx, %%eax;" : \ "=a"(ax), \ "=b"(bx), \ "=c"(cx), \ @@ -128,7 +129,7 @@ typedef uint64_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%rbp;" \ + __asm__ volatile ("push %%rbp;" \ "movq %12, %%rbp;" \ "rep outsb;" \ "pop %%rbp;" : \ @@ -152,7 +153,7 @@ typedef uint64_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%rbp;" \ + __asm__ volatile ("push %%rbp;" \ "movq %12, %%rbp;" \ "rep insb;" \ "pop %%rbp" : \ @@ -183,7 +184,7 @@ typedef uint32_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%ebp;" \ + __asm__ volatile ("push %%ebp;" \ "mov %12, %%ebp;" \ "rep outsb;" \ "pop %%ebp;" : \ @@ -208,7 +209,7 @@ typedef uint32_t VMW_REG; port_num, magic, bp, \ ax, bx, cx, dx, si, di) \ ({ \ - __asm__ volatile ("push %%ebp;" \ + __asm__ volatile ("push %%ebp;" \ "mov %12, %%ebp;" \ "rep insb;" \ "pop %%ebp" : \ @@ -252,7 +253,7 @@ typedef uint32_t VMW_REG; (void) in_cx; (void) bp; \ (void) ax; (void) bx; (void) cx; \ (void) dx; (void) si; (void) di; - + #define VMW_PORT_HB_IN(cmd, in_cx, in_si, in_di, \ port_num, magic, bp, \ @@ -283,7 +284,7 @@ struct rpc_channel { /** - * svga_open_channel + * vmw_open_channel * * @channel: RPC channel * @protocol: @@ -291,7 +292,7 @@ struct rpc_channel { * Returns: PIPE_OK on success, PIPE_ERROR otherwise */ 
static enum pipe_error -svga_open_channel(struct rpc_channel *channel, unsigned protocol) +vmw_open_channel(struct rpc_channel *channel, unsigned protocol) { VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si = 0, di = 0; @@ -321,7 +322,7 @@ svga_open_channel(struct rpc_channel *channel, unsigned protocol) * Returns: PIPE_OK on success, PIPE_ERROR otherwises */ static enum pipe_error -svga_close_channel(struct rpc_channel *channel) +vmw_close_channel(struct rpc_channel *channel) { VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di; @@ -344,7 +345,7 @@ svga_close_channel(struct rpc_channel *channel) /** - * svga_send_msg: Sends a message to the host + * vmw_send_msg: Sends a message to the host * * @channel: RPC channel * @logmsg: NULL terminated string @@ -352,7 +353,7 @@ svga_close_channel(struct rpc_channel *channel) * Returns: PIPE_OK on success */ static enum pipe_error -svga_send_msg(struct rpc_channel *channel, const char *msg) +vmw_send_msg(struct rpc_channel *channel, const char *msg) { VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di, bp; size_t msg_len = strlen(msg); @@ -406,46 +407,42 @@ svga_send_msg(struct rpc_channel *channel, const char *msg) /** - * svga_host_log: Sends a log message to the host + * vmw_svga_winsys_host_log: Sends a log message to the host * * @log: NULL terminated string * - * Returns: PIPE_OK on success */ -enum pipe_error -svga_host_log(const char *log) +void +vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log) { struct rpc_channel channel; char *msg; int msg_len; - enum pipe_error ret = PIPE_OK; #ifdef MSG_NOT_IMPLEMENTED - return ret; + return; #endif if (!log) - return ret; + return; msg_len = strlen(log) + strlen("log ") + 1; msg = CALLOC(1, msg_len); if (msg == NULL) { debug_printf("Cannot allocate memory for log message\n"); - return PIPE_ERROR_OUT_OF_MEMORY; + return; } util_sprintf(msg, "log %s", log); - if (svga_open_channel(&channel, RPCI_PROTOCOL_NUM) || - svga_send_msg(&channel, msg) || - 
svga_close_channel(&channel)) { + if (vmw_open_channel(&channel, RPCI_PROTOCOL_NUM) || + vmw_send_msg(&channel, msg) || + vmw_close_channel(&channel)) { debug_printf("Failed to send log\n"); - - ret = PIPE_ERROR; } FREE(msg); - return ret; + return; } diff --git a/src/gallium/drivers/svga/svga_msg.h b/src/gallium/winsys/svga/drm/vmw_msg.h similarity index 89% rename from src/gallium/drivers/svga/svga_msg.h rename to src/gallium/winsys/svga/drm/vmw_msg.h index 9132ba7e240..57057f23638 100644 --- a/src/gallium/drivers/svga/svga_msg.h +++ b/src/gallium/winsys/svga/drm/vmw_msg.h @@ -26,17 +26,16 @@ * Author: * Sinclair Yeh */ -#ifndef _SVGA_MSG_H -#define _SVGA_MSG_H +#ifndef _VMW_MSG_H +#define _VMW_MSG_H /** - * svga_host_log: Sends a log message to the host + * vmw_host_log: Sends a log message to the host * * @log: NULL terminated string * - * Returns: PIPE_OK on success */ -enum pipe_error svga_host_log(const char *log); +void vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log); #endif diff --git a/src/gallium/winsys/svga/drm/vmw_screen_svga.c b/src/gallium/winsys/svga/drm/vmw_screen_svga.c index a6990414e20..cd3f21f6033 100644 --- a/src/gallium/winsys/svga/drm/vmw_screen_svga.c +++ b/src/gallium/winsys/svga/drm/vmw_screen_svga.c @@ -48,6 +48,7 @@ #include "vmw_surface.h" #include "vmw_buffer.h" #include "vmw_fence.h" +#include "vmw_msg.h" #include "vmw_shader.h" #include "vmw_query.h" #include "svga3d_surfacedefs.h" @@ -509,6 +510,8 @@ vmw_winsys_screen_init_svga(struct vmw_winsys_screen *vws) vws->base.stats_time_push = vmw_svga_winsys_stats_time_push; vws->base.stats_time_pop = vmw_svga_winsys_stats_time_pop; + vws->base.host_log = vmw_svga_winsys_host_log; + return TRUE; } diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c index 8753139107c..a4c1d50453b 100644 --- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c +++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c @@ -396,6 +396,7 @@ 
xlib_displaytarget_create(struct sw_winsys *winsys, { struct xlib_displaytarget *xlib_dt; unsigned nblocksy, size; + int ignore; xlib_dt = CALLOC_STRUCT(xlib_displaytarget); if (!xlib_dt) @@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys, xlib_dt->stride = align(util_format_get_stride(format, width), alignment); size = xlib_dt->stride * nblocksy; - if (!debug_get_option_xlib_no_shm()) { + if (!debug_get_option_xlib_no_shm() && + XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) { xlib_dt->data = alloc_shm(xlib_dt, size); if (xlib_dt->data) { xlib_dt->shm = True; diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c index 26de8c702df..a2d232a539c 100644 --- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c +++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c @@ -46,7 +46,7 @@ #define VIRGL_DRM_VERSION(major, minor) ((major) << 16 | (minor)) -#define VIRGL_DRM_VERSION_FENCE_FD VIRGL_DRM_VERSION(1, 0) +#define VIRGL_DRM_VERSION_FENCE_FD VIRGL_DRM_VERSION(0, 1) static inline boolean can_cache_resource(struct virgl_hw_res *res) @@ -870,7 +870,7 @@ static int virgl_drm_get_version(int fd) else if (version->version_major != 0) ret = -EINVAL; else - ret = version->version_minor; + ret = VIRGL_DRM_VERSION(0, version->version_minor); drmFreeVersion(version); diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c index 298adc80ef1..d53fc87e21e 100644 --- a/src/glx/dri3_glx.c +++ b/src/glx/dri3_glx.c @@ -642,7 +642,6 @@ dri3_set_swap_interval(__GLXDRIdrawable *pdraw, int interval) break; } - priv->swap_interval = interval; loader_dri3_set_swap_interval(&priv->loader_drawable, interval); return 0; @@ -659,7 +658,7 @@ dri3_get_swap_interval(__GLXDRIdrawable *pdraw) struct dri3_drawable *priv = (struct dri3_drawable *) pdraw; - return priv->swap_interval; + return priv->loader_drawable.swap_interval; } static void diff --git a/src/glx/dri3_priv.h b/src/glx/dri3_priv.h index 
1d3c03f9997..32a8d3f7e7d 100644 --- a/src/glx/dri3_priv.h +++ b/src/glx/dri3_priv.h @@ -117,7 +117,6 @@ struct dri3_context struct dri3_drawable { __GLXDRIdrawable base; struct loader_dri3_drawable loader_drawable; - int swap_interval; /* LIBGL_SHOW_FPS support */ uint64_t previous_ust; diff --git a/src/glx/drisw_glx.c b/src/glx/drisw_glx.c index 00c7fa100ab..48c03ca42e0 100644 --- a/src/glx/drisw_glx.c +++ b/src/glx/drisw_glx.c @@ -147,6 +147,9 @@ XDestroyDrawable(struct drisw_drawable * pdp, Display * dpy, XID drawable) if (pdp->ximage) XDestroyImage(pdp->ximage); + if (pdp->shminfo.shmid > 0) + XShmDetach(dpy, &pdp->shminfo); + free(pdp->visinfo); XFreeGC(dpy, pdp->gc); diff --git a/src/intel/Android.common.mk b/src/intel/Android.common.mk index 12cea6e5472..79d9f1284a0 100644 --- a/src/intel/Android.common.mk +++ b/src/intel/Android.common.mk @@ -38,7 +38,17 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa -LOCAL_SHARED_LIBRARIES := libexpat libz +LOCAL_SHARED_LIBRARIES := libz liblog + +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else +LOCAL_SHARED_LIBRARIES += \ + libexpat +endif + LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml diff --git a/src/intel/Android.compiler.mk b/src/intel/Android.compiler.mk index c2b01221dfc..41af7b20b9c 100644 --- a/src/intel/Android.compiler.mk +++ b/src/intel/Android.compiler.mk @@ -28,7 +28,7 @@ # --------------------------------------- include $(CLEAR_VARS) - +LOCAL_CFLAGS += -Wno-error LOCAL_MODULE := libmesa_intel_compiler LOCAL_MODULE_CLASS := STATIC_LIBRARIES diff --git a/src/intel/Android.dev.mk b/src/intel/Android.dev.mk index cd2ed66a176..3011ee232ed 100644 --- a/src/intel/Android.dev.mk +++ b/src/intel/Android.dev.mk @@ -33,5 +33,8 @@ LOCAL_C_INCLUDES := $(MESA_TOP)/include/drm-uapi LOCAL_SRC_FILES := $(DEV_FILES) +LOCAL_CFLAGS := \ + 
-Wno-gnu-variable-sized-type-not-at-end + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/intel/Android.vulkan.mk b/src/intel/Android.vulkan.mk index 7019c8cbc8f..73586803552 100644 --- a/src/intel/Android.vulkan.mk +++ b/src/intel/Android.vulkan.mk @@ -23,9 +23,10 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) include $(LOCAL_PATH)/Makefile.sources -VK_ENTRYPOINTS_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py - -VK_EXTENSIONS_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/vulkan/anv_extensions_gen.py +ANV_ENTRYPOINTS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py +ANV_EXTENSIONS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions_gen.py +ANV_EXTENSIONS_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions.py +VULKAN_API_XML := $(MESA_TOP)/src/vulkan/registry/vk.xml VULKAN_COMMON_INCLUDES := \ $(MESA_TOP)/include \ @@ -41,6 +42,18 @@ VULKAN_COMMON_INCLUDES := \ $(MESA_TOP)/src/compiler \ frameworks/native/vulkan/include +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +VULKAN_COMMON_INCLUDES += \ + frameworks/native/vulkan/include \ + frameworks/native/libs/nativebase/include \ + frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include + +VULKAN_COMMON_HEADER_LIBRARIES := \ + libcutils_headers \ + libhardware_headers +endif + # libmesa_anv_entrypoints with header and dummy.c # # This static library is built to pull entrypoints header @@ -59,16 +72,28 @@ LOCAL_C_INCLUDES := \ LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_entrypoints.h LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/dummy.c +LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.h $(intermediates)/vulkan/dummy.c: @mkdir -p $(dir $@) @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))" $(hide) touch $@ -$(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c - $(VK_ENTRYPOINTS_SCRIPT) \ +$(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c \ 
+ $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ --outdir $(dir $@) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml + --xml $(VULKAN_API_XML) + +$(intermediates)/vulkan/anv_extensions.h: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) + @mkdir -p $(dir $@) + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ + --out-h $@ LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(intermediates) @@ -107,6 +132,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -127,6 +153,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -147,6 +174,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -167,6 +195,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -187,6 +216,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ 
-207,6 +237,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -218,7 +249,7 @@ include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) LOCAL_MODULE := libmesa_vulkan_common LOCAL_MODULE_CLASS := STATIC_LIBRARIES - +LOCAL_CFLAGS += -Wno-error intermediates := $(call local-generated-sources-dir) LOCAL_SRC_FILES := $(VULKAN_FILES) @@ -240,27 +271,25 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_entrypoints.c LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.c -LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.h -$(intermediates)/vulkan/anv_entrypoints.c: +$(intermediates)/vulkan/anv_entrypoints.c: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) @mkdir -p $(dir $@) - $(VK_ENTRYPOINTS_SCRIPT) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \ + $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ --outdir $(dir $@) -$(intermediates)/vulkan/anv_extensions.c: +$(intermediates)/vulkan/anv_extensions.c: $(ANV_EXTENSIONS_GEN_SCRIPT) \ + $(ANV_EXTENSIONS_SCRIPT) \ + $(VULKAN_API_XML) @mkdir -p $(dir $@) - $(VK_EXTENSIONS_SCRIPT) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \ + $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \ + --xml $(VULKAN_API_XML) \ --out-c $@ -$(intermediates)/vulkan/anv_extensions.h: - @mkdir -p $(dir $@) - $(VK_EXTENSIONS_SCRIPT) \ - --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \ - --out-h $@ - LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -310,6 +339,16 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_anv_entrypoints LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz 
libsync liblog +LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES) + +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else + LOCAL_SHARED_LIBRARIES += \ + libexpat +endif include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/src/intel/Makefile.isl.am b/src/intel/Makefile.isl.am index a6733f3ba8e..dcb9d3ad6fc 100644 --- a/src/intel/Makefile.isl.am +++ b/src/intel/Makefile.isl.am @@ -33,12 +33,15 @@ ISL_GEN_LIBS = \ noinst_LTLIBRARIES += $(ISL_GEN_LIBS) \ isl/libisl.la \ - libisl_tiled_memcpy.la \ - libisl_tiled_memcpy_sse41.la + libisl_tiled_memcpy.la isl_libisl_la_LIBADD = $(ISL_GEN_LIBS) \ - libisl_tiled_memcpy.la \ - libisl_tiled_memcpy_sse41.la + libisl_tiled_memcpy.la + +if SSE41_SUPPORTED +isl_libisl_la_LIBADD += libisl_tiled_memcpy_sse41.la +noinst_LTLIBRARIES += libisl_tiled_memcpy_sse41.la +endif isl_libisl_la_SOURCES = $(ISL_FILES) $(ISL_GENERATED_FILES) diff --git a/src/intel/Makefile.vulkan.am b/src/intel/Makefile.vulkan.am index b315f10a01a..cad0a57bc7f 100644 --- a/src/intel/Makefile.vulkan.am +++ b/src/intel/Makefile.vulkan.am @@ -253,6 +253,7 @@ VULKAN_TESTS = \ vulkan/tests/block_pool_no_free \ vulkan/tests/state_pool_no_free \ vulkan/tests/state_pool_free_list_only \ + vulkan/tests/state_pool_padding \ vulkan/tests/state_pool VULKAN_TEST_LDADD = \ @@ -274,6 +275,10 @@ vulkan_tests_state_pool_free_list_only_CFLAGS = $(VULKAN_CFLAGS) vulkan_tests_state_pool_free_list_only_CPPFLAGS = $(VULKAN_CPPFLAGS) vulkan_tests_state_pool_free_list_only_LDADD = $(VULKAN_TEST_LDADD) +vulkan_tests_state_pool_padding_CFLAGS = $(VULKAN_CFLAGS) +vulkan_tests_state_pool_padding_CPPFLAGS = $(VULKAN_CPPFLAGS) +vulkan_tests_state_pool_padding_LDADD = $(VULKAN_TEST_LDADD) + vulkan_tests_state_pool_CFLAGS = $(VULKAN_CFLAGS) vulkan_tests_state_pool_CPPFLAGS = $(VULKAN_CPPFLAGS) vulkan_tests_state_pool_LDADD = 
$(VULKAN_TEST_LDADD) diff --git a/src/intel/blorp/meson.build b/src/intel/blorp/meson.build index c1201b0aa16..ff68d255164 100644 --- a/src/intel/blorp/meson.build +++ b/src/intel/blorp/meson.build @@ -33,5 +33,5 @@ libblorp = static_library( files_libblorp, include_directories : [inc_common, inc_intel], c_args : [c_vis_args, no_override_init_args], - dependencies : idep_nir_headers, + dependencies : [idep_nir_headers, idep_genxml], ) diff --git a/src/intel/common/gen_debug.c b/src/intel/common/gen_debug.c index a978f2f5818..8990d208207 100644 --- a/src/intel/common/gen_debug.c +++ b/src/intel/common/gen_debug.c @@ -85,6 +85,7 @@ static const struct debug_control debug_control[] = { { "nohiz", DEBUG_NO_HIZ }, { "color", DEBUG_COLOR }, { "reemit", DEBUG_REEMIT }, + { "heur32", DEBUG_HEUR32 }, { NULL, 0 } }; diff --git a/src/intel/common/gen_debug.h b/src/intel/common/gen_debug.h index 72d7ca20a39..c2ca2e2ebd6 100644 --- a/src/intel/common/gen_debug.h +++ b/src/intel/common/gen_debug.h @@ -83,6 +83,7 @@ extern uint64_t INTEL_DEBUG; #define DEBUG_NO_HIZ (1ull << 39) #define DEBUG_COLOR (1ull << 40) #define DEBUG_REEMIT (1ull << 41) +#define DEBUG_HEUR32 (1ull << 42) /* These flags are not compatible with the disk shader cache */ #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME @@ -90,7 +91,7 @@ extern uint64_t INTEL_DEBUG; /* These flags may affect program generation */ #define DEBUG_DISK_CACHE_MASK \ (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \ - DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32) + DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_HEUR32) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" diff --git a/src/intel/common/meson.build b/src/intel/common/meson.build index 332e978b0ad..ec45962502e 100644 --- a/src/intel/common/meson.build +++ b/src/intel/common/meson.build @@ -43,5 +43,5 @@ libintel_common = static_library( include_directories : [inc_common, inc_intel], c_args : [c_vis_args, 
no_override_init_args], link_with : [libisl], - dependencies : [dep_expat, dep_libdrm, dep_thread], + dependencies : [dep_expat, dep_libdrm, dep_thread, idep_genxml], ) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 61a4528d372..c294e5c3222 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -38,6 +38,15 @@ struct ra_regs; struct nir_shader; struct brw_program; +struct brw_simd32_heuristics_control { + bool grouped_sends_check; + int max_grouped_sends; + bool inst_count_check; + float inst_count_ratio; + bool mrt_check; + int max_mrts; +}; + struct brw_compiler { const struct gen_device_info *devinfo; @@ -118,6 +127,8 @@ struct brw_compiler { * whether nir_opt_large_constants will be run. */ bool supports_shader_constants; + + struct brw_simd32_heuristics_control simd32_heuristics_control; }; /** @@ -196,6 +207,9 @@ struct brw_sampler_prog_key_data { uint32_t yx_xuxv_image_mask; uint32_t xy_uxvx_image_mask; uint32_t ayuv_image_mask; + + /* Scale factor for each texture. 
*/ + float scale_factors[32]; }; /** diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 8dd3b94fbd5..5b29292d6a0 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -251,6 +251,62 @@ fs_inst::is_send_from_grf() const } } +bool +fs_inst::is_control_source(unsigned arg) const +{ + switch (opcode) { + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: + return arg == 0; + + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_SHUFFLE: + case SHADER_OPCODE_QUAD_SWIZZLE: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_IMAGE_SIZE: + case SHADER_OPCODE_GET_BUFFER_SIZE: + return arg == 1; + + case SHADER_OPCODE_MOV_INDIRECT: + case SHADER_OPCODE_CLUSTER_BROADCAST: + case SHADER_OPCODE_TEX: + case FS_OPCODE_TXB: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_LZ: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: + case SHADER_OPCODE_TXF_UMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXL_LZ: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_LOD: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_SAMPLEINFO: + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_BYTE_SCATTERED_READ: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE: + case SHADER_OPCODE_TYPED_ATOMIC: + case SHADER_OPCODE_TYPED_SURFACE_READ: + case SHADER_OPCODE_TYPED_SURFACE_WRITE: + return arg == 1 || arg == 2; + + case SHADER_OPCODE_SEND: + return arg == 0 || arg == 1; + + default: + return false; + } +} + /** * Returns true if this instruction's sources and destinations cannot * safely be the same register. 
@@ -3061,6 +3117,7 @@ fs_visitor::opt_peephole_csel() if (csel_inst != NULL) { progress = true; + csel_inst->saturate = inst->saturate; inst->remove(block); } @@ -3899,18 +3956,22 @@ fs_visitor::lower_integer_multiplication() bool needs_mov = false; fs_reg orig_dst = inst->dst; + + /* Get a new VGRF for the "low" 32x16-bit multiplication result if + * reusing the original destination is impossible due to hardware + * restrictions, source/destination overlap, or it being the null + * register. + */ fs_reg low = inst->dst; if (orig_dst.is_null() || orig_dst.file == MRF || regions_overlap(inst->dst, inst->size_written, inst->src[0], inst->size_read(0)) || regions_overlap(inst->dst, inst->size_written, - inst->src[1], inst->size_read(1))) { + inst->src[1], inst->size_read(1)) || + inst->dst.stride >= 4) { needs_mov = true; - /* Get a new VGRF but keep the same stride as inst->dst */ low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type); - low.stride = inst->dst.stride; - low.offset = inst->dst.offset % REG_SIZE; } /* Get a new VGRF but keep the same stride as inst->dst */ @@ -7542,6 +7603,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, char **error_str) { const struct gen_device_info *devinfo = compiler->devinfo; + bool simd16_failed = false; + bool simd16_spilled = false; shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); brw_nir_lower_fs_inputs(shader, devinfo, key); @@ -7608,10 +7671,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader_time_index16); v16.import_uniforms(&v8); if (!v16.run_fs(allow_spilling, use_rep_send)) { + simd16_failed = true; compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", v16.fail_msg); } else { + simd16_spilled = v16.spilled_any_registers; simd16_cfg = v16.cfg; prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used); @@ -7619,9 +7684,17 @@ brw_compile_fs(const 
struct brw_compiler *compiler, void *log_data, } /* Currently, the compiler only supports SIMD32 on SNB+ */ + const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control; + uint64_t mrts = shader->info.outputs_written << FRAG_RESULT_DATA0; + if (v8.max_dispatch_width >= 32 && !use_rep_send && compiler->devinfo->gen >= 6 && - unlikely(INTEL_DEBUG & DEBUG_DO32)) { + (unlikely(INTEL_DEBUG & DEBUG_DO32) || + (unlikely(INTEL_DEBUG & DEBUG_HEUR32) && + !simd16_failed && !simd16_spilled && + (!ctrl->mrt_check || + (ctrl->mrt_check && + u_count_bits64(&mrts) <= ctrl->max_mrts))))) { /* Try a SIMD32 compile */ fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 32, @@ -7632,9 +7705,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, "SIMD32 shader failed to compile: %s", v32.fail_msg); } else { - simd32_cfg = v32.cfg; - prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; - prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) || + v32.run_heuristic(ctrl)) { + simd32_cfg = v32.cfg; + prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; + prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + } } } @@ -7713,13 +7789,49 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd32_cfg) { - prog_data->dispatch_32 = true; - prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32); + uint32_t offset = g.generate_code(simd32_cfg, 32); + + if (unlikely(INTEL_DEBUG & DEBUG_DO32) || + (unlikely(INTEL_DEBUG & DEBUG_HEUR32) && + (!simd16_cfg || + (simd16_cfg && + (!ctrl->inst_count_check || + (ctrl->inst_count_check && + (float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) { + prog_data->dispatch_32 = true; + prog_data->prog_offset_32 = offset; + } } return g.get_assembly(); } +bool +fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl) { + int 
grouped_sends = 0; + int max_grouped_sends = 0; + bool pass = true; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode >= SHADER_OPCODE_TEX && inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL) { + ++grouped_sends; + } else if (grouped_sends > 0) { + if (grouped_sends > max_grouped_sends) { + max_grouped_sends = grouped_sends; + } + grouped_sends = 0; + } + } + + if (ctrl->grouped_sends_check) { + if (max_grouped_sends > ctrl->max_grouped_sends) { + pass = false; + } + } + + return pass; +} + fs_reg * fs_visitor::emit_cs_work_group_id_setup() { diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 5361b768003..72acf85581e 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -289,6 +289,8 @@ class fs_visitor : public backend_shader void dump_instruction(backend_instruction *inst); void dump_instruction(backend_instruction *inst, FILE *file); + bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl); + const void *const key; const struct brw_sampler_prog_key_data *key_tex; @@ -400,6 +402,7 @@ class fs_generator void enable_debug(const char *shader_name); int generate_code(const cfg_t *cfg, int dispatch_width); + int get_inst_count(int dispatch_width); const unsigned *get_assembly(); private: @@ -495,6 +498,7 @@ class fs_generator struct brw_stage_prog_data * const prog_data; unsigned dispatch_width; /**< 8, 16 or 32 */ + int inst_count[3]; /* for 8, 16 and 32 */ exec_list discard_halt_patches; unsigned promoted_constants; diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp index 5fb522f810f..b58730fbbe5 100644 --- a/src/intel/compiler/brw_fs_cmod_propagation.cpp +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -255,6 +255,13 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block) if (inst->opcode == BRW_OPCODE_AND) break; + /* Not safe to use inequality operators if the types are different + */ + if 
(scan_inst->dst.type != inst->src[0].type && + inst->conditional_mod != BRW_CONDITIONAL_Z && + inst->conditional_mod != BRW_CONDITIONAL_NZ) + break; + /* Comparisons operate differently for ints and floats */ if (scan_inst->dst.type != inst->dst.type && (scan_inst->dst.type == BRW_REGISTER_TYPE_F || diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index c23ce1ef426..bba7eb35830 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -32,9 +32,10 @@ * 12.5 (p356). */ -#define ACP_HASH_SIZE 16 +#define ACP_HASH_SIZE 64 #include "util/bitset.h" +#include "util/u_math.h" #include "brw_fs.h" #include "brw_fs_live_variables.h" #include "brw_cfg.h" @@ -46,6 +47,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */ struct acp_entry : public exec_node { fs_reg dst; fs_reg src; + unsigned global_idx; uint8_t size_written; uint8_t size_read; enum opcode opcode; @@ -142,6 +144,8 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) { acp[next_acp] = entry; + entry->global_idx = next_acp; + /* opt_copy_propagation_local populates out_acp with copies created * in a block which are still live at the end of the block. This * is exactly what we want in the COPY set. @@ -167,21 +171,74 @@ void fs_copy_prop_dataflow::setup_initial_values() { /* Initialize the COPY and KILL sets. */ - foreach_block (block, cfg) { - foreach_inst_in_block(fs_inst, inst, block) { - if (inst->dst.file != VGRF) - continue; + { + /* Create a temporary table of ACP entries which we'll use for efficient + * look-up. Unfortunately, we have to do this in two steps because we + * have to match both sources and destinations and an ACP entry can only + * be in one list at a time. 
+ * + * We choose to make the table size between num_acp/2 and num_acp/4 to + * try and trade off between the time it takes to initialize the table + * via exec_list constructors or make_empty() and the cost of + * collisions. In practice, it doesn't appear to matter too much what + * size we make the table as long as it's roughly the same order of + * magnitude as num_acp. We get most of the benefit of the table + * approach even if we use a table of size ACP_HASH_SIZE though a + * full-sized table is 1-2% faster in practice. + */ + unsigned acp_table_size = util_next_power_of_two(num_acp) / 4; + acp_table_size = MAX2(acp_table_size, ACP_HASH_SIZE); + exec_list *acp_table = new exec_list[acp_table_size]; - /* Mark ACP entries which are killed by this instruction. */ - for (int i = 0; i < num_acp; i++) { - if (regions_overlap(inst->dst, inst->size_written, - acp[i]->dst, acp[i]->size_written) || - regions_overlap(inst->dst, inst->size_written, - acp[i]->src, acp[i]->size_read)) { - BITSET_SET(bd[block->num].kill, i); + /* First, get all the KILLs for instructions which overwrite ACP + * destinations. + */ + for (int i = 0; i < num_acp; i++) { + unsigned idx = acp[i]->dst.nr & (acp_table_size - 1); + acp_table[idx].push_tail(acp[i]); + } + + foreach_block (block, cfg) { + foreach_inst_in_block(fs_inst, inst, block) { + if (inst->dst.file != VGRF) + continue; + + unsigned idx = inst->dst.nr & (acp_table_size - 1); + foreach_in_list(acp_entry, entry, &acp_table[idx]) { + if (regions_overlap(inst->dst, inst->size_written, + entry->dst, entry->size_written)) + BITSET_SET(bd[block->num].kill, entry->global_idx); } } } + + /* Clear the table for the second pass */ + for (unsigned i = 0; i < acp_table_size; i++) + acp_table[i].make_empty(); + + /* Next, get all the KILLs for instructions which overwrite ACP + * sources. 
+ */ + for (int i = 0; i < num_acp; i++) { + unsigned idx = acp[i]->src.nr & (acp_table_size - 1); + acp_table[idx].push_tail(acp[i]); + } + + foreach_block (block, cfg) { + foreach_inst_in_block(fs_inst, inst, block) { + if (inst->dst.file != VGRF) + continue; + + unsigned idx = inst->dst.nr & (acp_table_size - 1); + foreach_in_list(acp_entry, entry, &acp_table[idx]) { + if (regions_overlap(inst->dst, inst->size_written, + entry->src, entry->size_read)) + BITSET_SET(bd[block->num].kill, entry->global_idx); + } + } + } + + delete [] acp_table; } /* Populate the initial values for the livein and liveout sets. For the @@ -904,6 +961,25 @@ fs_visitor::opt_copy_propagation() foreach_block (block, cfg) { progress = opt_copy_propagation_local(copy_prop_ctx, block, out_acp[block->num]) || progress; + + /* If the destination of an ACP entry exists only within this block, + * then there's no need to keep it for dataflow analysis. We can delete + * it from the out_acp table and avoid growing the bitsets any bigger + * than we absolutely have to. + * + * Because nothing in opt_copy_propagation_local touches the block + * start/end IPs and opt_copy_propagation_local is incapable of + * extending the live range of an ACP destination beyond the block, + * it's safe to use the liveness information in this way. + */ + for (unsigned a = 0; a < ACP_HASH_SIZE; a++) { + foreach_in_list_safe(acp_entry, entry, &out_acp[block->num][a]) { + assert(entry->dst.file == VGRF); + if (block->start_ip <= virtual_grf_start[entry->dst.nr] && + virtual_grf_end[entry->dst.nr] <= block->end_ip) + entry->remove(); + } + } } /* Do dataflow analysis for those available copies. 
*/ diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index e3b68fa3165..82c2713a77f 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -90,9 +90,16 @@ brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst, * different execution size when the number of components * written to each destination GRF is not the same. */ - const unsigned width = MIN2(reg_width, phys_width); - brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); - brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + if (reg->stride > 4) { + assert(reg != &inst->dst); + assert(reg->stride * type_sz(reg->type) <= REG_SIZE); + brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, reg->stride, 1, 0); + } else { + const unsigned width = MIN2(reg_width, phys_width); + brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); + brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); + } if (devinfo->gen == 7 && !devinfo->is_haswell) { /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13): @@ -2093,6 +2100,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) break; case SHADER_OPCODE_INTERLOCK: + assert(devinfo->gen >= 9); /* The interlock is basically a memory fence issued via sendc */ brw_memory_fence(p, dst, BRW_OPCODE_SENDC); break; @@ -2289,6 +2297,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) fill_count, promoted_constants, before_size, after_size); + inst_count[ffs(dispatch_width) - 4] = before_size / 16; + return start_offset; } @@ -2297,3 +2307,13 @@ fs_generator::get_assembly() { return brw_get_program(p, &prog_data->program_size); } + +int +fs_generator::get_inst_count(int dispatch_width) +{ + if (dispatch_width == 8 || dispatch_width == 16 || dispatch_width == 32) { + return inst_count[ffs(dispatch_width) - 4]; + } else { + return 0; + } +} 
\ No newline at end of file diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp index df50993dee6..c60d4700419 100644 --- a/src/intel/compiler/brw_fs_lower_regioning.cpp +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp @@ -71,15 +71,33 @@ namespace { !is_byte_raw_mov(inst)) { return get_exec_type_size(inst); } else { - unsigned stride = inst->dst.stride * type_sz(inst->dst.type); + /* Calculate the maximum byte stride and the minimum/maximum type + * size across all source and destination operands we are required to + * lower. + */ + unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type); + unsigned min_size = type_sz(inst->dst.type); + unsigned max_size = type_sz(inst->dst.type); for (unsigned i = 0; i < inst->sources; i++) { - if (!is_uniform(inst->src[i])) - stride = MAX2(stride, inst->src[i].stride * - type_sz(inst->src[i].type)); + if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) { + const unsigned size = type_sz(inst->src[i].type); + max_stride = MAX2(max_stride, inst->src[i].stride * size); + min_size = MIN2(min_size, size); + max_size = MAX2(max_size, size); + } } - return stride; + /* All operands involved in lowering need to fit in the calculated + * stride. + */ + assert(max_size <= 4 * min_size); + + /* Attempt to use the largest byte stride among all present operands, + * but never exceed a stride of 4 since that would lead to illegal + * destination regions during lowering. 
+ */ + return MIN2(max_stride, 4 * min_size); } } @@ -92,7 +110,7 @@ namespace { required_dst_byte_offset(const fs_inst *inst) { for (unsigned i = 0; i < inst->sources; i++) { - if (!is_uniform(inst->src[i])) + if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) if (reg_offset(inst->src[i]) % REG_SIZE != reg_offset(inst->dst) % REG_SIZE) return 0; @@ -109,7 +127,7 @@ namespace { has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst, unsigned i) { - if (is_unordered(inst)) { + if (is_unordered(inst) || inst->is_control_source(i)) { return false; } else { const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index f16627b8a64..6f0d9731cfe 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -512,6 +512,15 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) return false; + /* If either opcode has source modifiers, bail. + * + * TODO: We can potentially handle source modifiers if both of the opcodes + * we're combining are signed integers. + */ + if (instr->src[0].abs || instr->src[0].negate || + src0->src[0].abs || src0->src[0].negate) + return false; + unsigned element = nir_src_as_uint(src0->src[1].src); /* Element type to extract.*/ @@ -1484,16 +1493,25 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * Use two instructions and a word or DWord intermediate integer type. 
*/ if (nir_dest_bit_size(instr->dest.dest) == 64) { - const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8); + const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); if (instr->op == nir_op_extract_i8) { /* If we need to sign extend, extract to a word first */ fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); bld.MOV(w_temp, subscript(op[0], type, byte)); bld.MOV(result, w_temp); + } else if (byte & 1) { + /* Extract the high byte from the word containing the desired byte + * offset. + */ + bld.SHR(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(8)); } else { /* Otherwise use an AND with 0xff and a word type */ - bld.AND(result, subscript(op[0], type, byte / 2), brw_imm_uw(0xff)); + bld.AND(result, + subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), + brw_imm_uw(0xff)); } } else { const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 6961cb1caf4..6e18bdfe68a 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -591,7 +591,7 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) */ foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { - for (unsigned i = 0; i < 3; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); } @@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) * messages adding a node interference to the grf127_send_hack_node. * This node has a fixed asignment to grf127. * - * We don't apply it to SIMD16 because previous code avoids any register - * overlap between sources and destination. 
+ * We don't apply it to SIMD16 instructions because previous code avoids + * any register overlap between sources and destination. */ ra_set_node_reg(g, grf127_send_hack_node, 127); - if (dispatch_width == 8) { - foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->is_send_from_grf() && inst->dst.file == VGRF) - ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); - } + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->exec_size < 16 && inst->is_send_from_grf() && + inst->dst.file == VGRF) + ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); } if (spilled_any_registers) { @@ -711,14 +710,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && inst->src[2].file == VGRF && inst->src[3].file == VGRF && - inst->src[2].nr != inst->src[3].nr) { - for (unsigned i = 0; i < inst->mlen; i++) { - for (unsigned j = 0; j < inst->ex_mlen; j++) { - ra_add_node_interference(g, inst->src[2].nr + i, - inst->src[3].nr + j); - } - } - } + inst->src[2].nr != inst->src[3].nr) + ra_add_node_interference(g, inst->src[2].nr, + inst->src[3].nr); } } diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index d05357e822e..c4427a658b0 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -357,6 +357,13 @@ class fs_inst : public backend_instruction { bool can_change_types() const; bool has_source_and_destination_hazard() const; + /** + * Return whether \p arg is a control source of a virtual instruction which + * shouldn't contribute to the execution type and usual regioning + * restriction calculations of arithmetic instructions. + */ + bool is_control_source(unsigned arg) const; + /** * Return the subset of flag registers read by the instruction as a bitset * with byte granularity. 
@@ -461,7 +468,8 @@ get_exec_type(const fs_inst *inst) brw_reg_type exec_type = BRW_REGISTER_TYPE_B; for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != BAD_FILE) { + if (inst->src[i].file != BAD_FILE && + !inst->is_control_source(i)) { const brw_reg_type t = get_exec_type(inst->src[i].type); if (type_sz(t) > type_sz(exec_type)) exec_type = t; diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9dbf06004a4..6b8f4d30c1a 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -590,9 +590,9 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, const bool is_vec4_tessellation = !is_scalar && (nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL); - OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); - OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, - compiler->devinfo->gen >= 6); + OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation); + if (compiler->devinfo->gen >= 6) + OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation); OPT(nir_opt_intrinsics); OPT(nir_opt_idiv_const, 32); @@ -794,6 +794,17 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) OPT(brw_nir_lower_mem_access_bit_sizes); + /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and + * SSBOs, our back-end is capable of loading an entire vec4 at a time and + * we would like to take advantage of that whenever possible regardless of + * whether or not the app gives us full loads. This should allow the + * optimizer to combine UBO and SSBO load operations and save us some send + * messages. 
+ */ + OPT(nir_lower_array_deref_of_vec, + nir_var_mem_ubo | nir_var_mem_ssbo, + nir_lower_direct_array_deref_of_vec_load); + /* Get rid of split copies */ nir = brw_nir_optimize(nir, compiler, is_scalar, false); @@ -842,6 +853,23 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, *producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false); *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); } + + NIR_PASS_V(*producer, nir_lower_io_to_vector, nir_var_shader_out); + NIR_PASS_V(*consumer, nir_lower_io_to_vector, nir_var_shader_in); + + if ((*producer)->info.stage != MESA_SHADER_TESS_CTRL) { + /* Calling lower_io_to_vector creates output variable writes with + * write-masks. On non-TCS outputs, the back-end can't handle it and we + * need to call nir_lower_io_to_temporaries to get rid of them. This, + * in turn, creates temporary variables and extra copy_deref intrinsics + * that we need to clean up. + */ + NIR_PASS_V(*producer, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(*producer), true, false); + NIR_PASS_V(*producer, nir_lower_global_vars_to_local); + NIR_PASS_V(*producer, nir_split_var_copies); + NIR_PASS_V(*producer, nir_lower_var_copies); + } } /* Prepare the given shader for codegen @@ -932,7 +960,9 @@ brw_nir_apply_sampler_key(nir_shader *nir, bool is_scalar) { const struct gen_device_info *devinfo = compiler->devinfo; - nir_lower_tex_options tex_options = { 0 }; + nir_lower_tex_options tex_options = { + .lower_txd_clamp_if_sampler_index_not_lt_16 = true, + }; /* Iron Lake and prior require lowering of all rectangle textures */ if (devinfo->gen < 6) @@ -964,6 +994,10 @@ brw_nir_apply_sampler_key(nir_shader *nir, tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask; tex_options.lower_ayuv_external = key_tex->ayuv_image_mask; + /* Setup array of scaling factors for each texture. 
*/ + memcpy(&tex_options.scale_factors, &key_tex->scale_factors, + sizeof(tex_options.scale_factors)); + if (nir_lower_tex(nir, &tex_options)) { nir_validate_shader(nir, "after nir_lower_tex"); nir = brw_nir_optimize(nir, compiler, is_scalar, false); diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index 4489c682d01..785508f1e3f 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -1160,6 +1160,12 @@ vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo, if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW) return false; + /* If we write to the flag register changing the swizzle would change + * what channels are written to the flag register. + */ + if (writes_flag()) + return false; + /* We can't swizzle implicit accumulator access. We'd have to * reswizzle the producer of the accumulator value in addition * to the consumer (i.e. both MUL and MACH). Just skip this. diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp index 760327d559d..a7a3bb8fb06 100644 --- a/src/intel/compiler/brw_vec4_cmod_propagation.cpp +++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp @@ -173,19 +173,19 @@ opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v) /* Given a sequence like: * - * cmp.ge.f0(8) g21<1>.xF g20<4>.xF g18<4>.xF + * cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF * ... - * cmp.nz.f0(8) null<1>D g21<4>.xD 0D + * cmp.nz.f0(8) null<1>D g21<4>.zD 0D * * Replace it with something like: * - * cmp.ge.f0(8) g22<1>F g20<4>.xF g18<4>.xF - * mov(8) g21<1>.xF g22<1>.xxxxF + * cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF + * mov(8) g21<1>.xF g22<1>.zzzzF * * The added MOV will most likely be removed later. In the * worst case, it should be cheaper to schedule. 
*/ - temp.swizzle = inst->src[0].swizzle; + temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask); temp.type = scan_inst->src[0].type; vec4_instruction *mov = v->MOV(scan_inst->dst, temp); diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp index 659fbb2d1bc..4215af1fb02 100644 --- a/src/intel/compiler/test_fs_cmod_propagation.cpp +++ b/src/intel/compiler/test_fs_cmod_propagation.cpp @@ -889,3 +889,35 @@ TEST_F(cmod_propagation_test, subtract_delete_compare_derp) EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 1)->opcode); EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); } + +TEST_F(cmod_propagation_test, signed_unsigned_comparison_mismatch) +{ + const fs_builder &bld = v->bld; + fs_reg dest0 = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::int_type); + src0.type = BRW_REGISTER_TYPE_W; + + bld.ASR(dest0, negate(src0), brw_imm_d(15)); + bld.CMP(bld.null_reg_ud(), retype(dest0, BRW_REGISTER_TYPE_UD), + brw_imm_ud(0u), BRW_CONDITIONAL_LE); + + /* = Before = + * 0: asr(8) dest:D -src0:W 15D + * 1: cmp.le.f0(8) null:UD dest:UD 0UD + * + * = After = + * (no changes) + */ + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ASR, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 1)->conditional_mod); +} diff --git a/src/intel/dev/gen_device_info.c b/src/intel/dev/gen_device_info.c index 5dbd0607572..625ebc031dc 100644 --- a/src/intel/dev/gen_device_info.c +++ b/src/intel/dev/gen_device_info.c @@ -414,6 +414,7 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = { .has_64bit_types = true, \ .supports_simd16_3src = true, \ .has_surface_tile_offset = true, \ + 
.num_thread_per_eu = 7, \ .max_vs_threads = 504, \ .max_tcs_threads = 504, \ .max_tes_threads = 504, \ @@ -427,7 +428,6 @@ static const struct gen_device_info gen_device_info_bdw_gt1 = { .num_slices = 1, .num_subslices = { 2, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 2, .max_cs_threads = 42, .urb = { @@ -452,7 +452,6 @@ static const struct gen_device_info gen_device_info_bdw_gt2 = { .num_slices = 1, .num_subslices = { 3, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 4, .max_cs_threads = 56, .urb = { @@ -477,7 +476,6 @@ static const struct gen_device_info gen_device_info_bdw_gt3 = { .num_slices = 2, .num_subslices = { 3, 3, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 8, .max_cs_threads = 56, .urb = { @@ -503,7 +501,6 @@ static const struct gen_device_info gen_device_info_chv = { .num_slices = 1, .num_subslices = { 2, }, .num_eu_per_subslice = 8, - .num_thread_per_eu = 7, .l3_banks = 2, .max_vs_threads = 80, .max_tcs_threads = 80, @@ -609,8 +606,7 @@ static const struct gen_device_info gen_device_info_chv = { #define GEN9_FEATURES \ GEN8_FEATURES, \ GEN9_HW_INFO, \ - .has_sample_with_hiz = true, \ - .num_thread_per_eu = 7 + .has_sample_with_hiz = true static const struct gen_device_info gen_device_info_skl_gt1 = { GEN9_FEATURES, .gt = 1, @@ -777,6 +773,7 @@ static const struct gen_device_info gen_device_info_cfl_gt1 = { .num_subslices = { 2, }, .num_eu_per_subslice = 6, .l3_banks = 2, + .urb.size = 192, .simulator_id = 24, }; static const struct gen_device_info gen_device_info_cfl_gt2 = { diff --git a/src/intel/genxml/gen10.xml b/src/intel/genxml/gen10.xml index 284633aedd4..4cb1f05ae25 100644 --- a/src/intel/genxml/gen10.xml +++ b/src/intel/genxml/gen10.xml @@ -2043,7 +2043,10 @@ - + + + + diff --git a/src/intel/genxml/gen11.xml b/src/intel/genxml/gen11.xml index 95a84a2f597..a7c06c5ab60 100644 --- a/src/intel/genxml/gen11.xml +++ b/src/intel/genxml/gen11.xml @@ -2063,7 +2063,10 @@ - + + + + diff 
--git a/src/intel/genxml/gen7.xml b/src/intel/genxml/gen7.xml index 363fd8664bf..1b2c7d996f9 100644 --- a/src/intel/genxml/gen7.xml +++ b/src/intel/genxml/gen7.xml @@ -1399,7 +1399,10 @@ - + + + + diff --git a/src/intel/genxml/gen75.xml b/src/intel/genxml/gen75.xml index a1da9cae041..95b306139eb 100644 --- a/src/intel/genxml/gen75.xml +++ b/src/intel/genxml/gen75.xml @@ -1713,7 +1713,10 @@ - + + + + diff --git a/src/intel/genxml/gen8.xml b/src/intel/genxml/gen8.xml index 4676d9bca9c..0226d7c0c66 100644 --- a/src/intel/genxml/gen8.xml +++ b/src/intel/genxml/gen8.xml @@ -1816,7 +1816,10 @@ - + + + + diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml index 8afa986df55..88fc2da7885 100644 --- a/src/intel/genxml/gen9.xml +++ b/src/intel/genxml/gen9.xml @@ -1995,7 +1995,10 @@ - + + + + diff --git a/src/intel/genxml/meson.build b/src/intel/genxml/meson.build index d0c982d0f8b..343b4fcc45f 100644 --- a/src/intel/genxml/meson.build +++ b/src/intel/genxml/meson.build @@ -57,3 +57,5 @@ foreach f : gen_xml_files capture : true, ) endforeach + +idep_genxml = declare_dependency(sources : [gen_xml_pack, genX_bits_h, genX_xml_h]) diff --git a/src/intel/meson.build b/src/intel/meson.build index 3c57e79d325..a5bb03e314a 100644 --- a/src/intel/meson.build +++ b/src/intel/meson.build @@ -21,9 +21,9 @@ c_sse2_args = ['-msse2', '-mstackrealign'] inc_intel = include_directories('.') +subdir('genxml') subdir('blorp') subdir('dev') -subdir('genxml') subdir('isl') subdir('common') subdir('compiler') diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c index 006175c8c65..e9cc5764924 100644 --- a/src/intel/vulkan/anv_allocator.c +++ b/src/intel/vulkan/anv_allocator.c @@ -165,7 +165,7 @@ anv_state_table_init(struct anv_state_table *table, goto fail_fd; } - if (!u_vector_init(&table->mmap_cleanups, + if (!u_vector_init(&table->cleanups, round_to_power_of_two(sizeof(struct anv_state_table_cleanup)), 128)) { result = 
vk_error(VK_ERROR_INITIALIZATION_FAILED); @@ -179,12 +179,12 @@ anv_state_table_init(struct anv_state_table *table, uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE; result = anv_state_table_expand_range(table, initial_size); if (result != VK_SUCCESS) - goto fail_mmap_cleanups; + goto fail_cleanups; return VK_SUCCESS; - fail_mmap_cleanups: - u_vector_finish(&table->mmap_cleanups); + fail_cleanups: + u_vector_finish(&table->cleanups); fail_fd: close(table->fd); @@ -195,7 +195,7 @@ static VkResult anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) { void *map; - struct anv_mmap_cleanup *cleanup; + struct anv_state_table_cleanup *cleanup; /* Assert that we only ever grow the pool */ assert(size >= table->state.end); @@ -204,11 +204,11 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) if (size > BLOCK_POOL_MEMFD_SIZE) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - cleanup = u_vector_add(&table->mmap_cleanups); + cleanup = u_vector_add(&table->cleanups); if (!cleanup) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - *cleanup = ANV_MMAP_CLEANUP_INIT; + *cleanup = ANV_STATE_TABLE_CLEANUP_INIT; /* Just leak the old map until we destroy the pool. We can't munmap it * without races or imposing locking on the block allocate fast path. 
On @@ -272,12 +272,12 @@ anv_state_table_finish(struct anv_state_table *table) { struct anv_state_table_cleanup *cleanup; - u_vector_foreach(cleanup, &table->mmap_cleanups) { + u_vector_foreach(cleanup, &table->cleanups) { if (cleanup->map) munmap(cleanup->map, cleanup->size); } - u_vector_finish(&table->mmap_cleanups); + u_vector_finish(&table->cleanups); close(table->fd); } diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 53303e0e745..60d332c33b6 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -128,8 +128,13 @@ static void anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer, struct anv_cmd_pipeline_state *pipe_state) { - for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) - vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]); + for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) { + if (pipe_state->push_descriptors[i]) { + anv_descriptor_set_layout_unref(cmd_buffer->device, + pipe_state->push_descriptors[i]->set.layout); + vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]); + } + } } static void @@ -957,10 +962,11 @@ anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer) return iview; } -static struct anv_push_descriptor_set * -anv_cmd_buffer_get_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, - VkPipelineBindPoint bind_point, - uint32_t set) +static struct anv_descriptor_set * +anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct anv_descriptor_set_layout *layout, + uint32_t _set) { struct anv_cmd_pipeline_state *pipe_state; if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { @@ -971,19 +977,31 @@ anv_cmd_buffer_get_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, } struct anv_push_descriptor_set **push_set = - &pipe_state->push_descriptors[set]; + &pipe_state->push_descriptors[_set]; if (*push_set == 
NULL) { - *push_set = vk_alloc(&cmd_buffer->pool->alloc, - sizeof(struct anv_push_descriptor_set), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + *push_set = vk_zalloc(&cmd_buffer->pool->alloc, + sizeof(struct anv_push_descriptor_set), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (*push_set == NULL) { anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); return NULL; } } - return *push_set; + struct anv_descriptor_set *set = &(*push_set)->set; + + if (set->layout != layout) { + if (set->layout) + anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout); + anv_descriptor_set_layout_ref(layout); + set->layout = layout; + } + set->size = anv_descriptor_set_layout_size(layout); + set->buffer_count = layout->buffer_count; + set->buffer_views = (*push_set)->buffer_views; + + return set; } void anv_CmdPushDescriptorSetKHR( @@ -1001,19 +1019,12 @@ void anv_CmdPushDescriptorSetKHR( struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; - struct anv_push_descriptor_set *push_set = - anv_cmd_buffer_get_push_descriptor_set(cmd_buffer, - pipelineBindPoint, _set); - if (!push_set) + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint, + set_layout, _set); + if (!set) return; - struct anv_descriptor_set *set = &push_set->set; - - set->layout = set_layout; - set->size = anv_descriptor_set_layout_size(set_layout); - set->buffer_count = set_layout->buffer_count; - set->buffer_views = push_set->buffer_views; - /* Go through the user supplied descriptors. 
*/ for (uint32_t i = 0; i < descriptorWriteCount; i++) { const VkWriteDescriptorSet *write = &pDescriptorWrites[i]; @@ -1093,19 +1104,12 @@ void anv_CmdPushDescriptorSetWithTemplateKHR( struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; - struct anv_push_descriptor_set *push_set = - anv_cmd_buffer_get_push_descriptor_set(cmd_buffer, - template->bind_point, _set); - if (!push_set) + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point, + set_layout, _set); + if (!set) return; - struct anv_descriptor_set *set = &push_set->set; - - set->layout = set_layout; - set->size = anv_descriptor_set_layout_size(set_layout); - set->buffer_count = set_layout->buffer_count; - set->buffer_views = push_set->buffer_views; - anv_descriptor_set_write_template(set, cmd_buffer->device, &cmd_buffer->surface_state_stream, diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index a4e466cf3dd..0259abea0bf 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -58,6 +58,9 @@ void anv_GetDescriptorSetLayoutSupport( anv_foreach_stage(s, binding->stageFlags) surface_count[s] += sampler->n_planes; } + } else { + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += binding->descriptorCount; } break; @@ -70,10 +73,10 @@ void anv_GetDescriptorSetLayoutSupport( bool supported = true; for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { - /* Our maximum binding table size is 250 and we need to reserve 8 for - * render targets. 240 is a nice round number. + /* Our maximum binding table size is 240 and we need to reserve 8 for + * render targets. 
*/ - if (surface_count[s] >= 240) + if (surface_count[s] >= MAX_BINDING_TABLE_SIZE - MAX_RTS) supported = false; } @@ -458,6 +461,8 @@ VkResult anv_CreateDescriptorPool( &device->surface_state_pool, 4096); pool->surface_state_free_list = NULL; + list_inithead(&pool->desc_sets); + *pDescriptorPool = anv_descriptor_pool_to_handle(pool); return VK_SUCCESS; @@ -474,7 +479,13 @@ void anv_DestroyDescriptorPool( if (!pool) return; + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_destroy(device, pool, set); + } + anv_state_stream_finish(&pool->surface_state_stream); + vk_free2(&device->alloc, pAllocator, pool); } @@ -486,6 +497,11 @@ VkResult anv_ResetDescriptorPool( ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool); + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_destroy(device, pool, set); + } + pool->next = 0; pool->free_list = EMPTY; anv_state_stream_finish(&pool->surface_state_stream); @@ -630,6 +646,8 @@ anv_descriptor_set_destroy(struct anv_device *device, entry->size = set->size; pool->free_list = (char *) entry - pool->data; } + + list_del(&set->pool_link); } VkResult anv_AllocateDescriptorSets( @@ -652,6 +670,8 @@ VkResult anv_AllocateDescriptorSets( if (result != VK_SUCCESS) break; + list_addtail(&set->pool_link, &pool->desc_sets); + pDescriptorSets[i] = anv_descriptor_set_to_handle(set); } diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index f44b046cf5d..99b512a0387 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -128,6 +128,8 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) */ device->memory.heap_count = 1; device->memory.heaps[0] = (struct anv_memory_heap) { + .vma_start = LOW_HEAP_MIN_ADDRESS, + .vma_size = LOW_HEAP_SIZE, .size = heap_size, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, 
.supports_48bit_addresses = false, @@ -147,11 +149,19 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) device->memory.heap_count = 2; device->memory.heaps[0] = (struct anv_memory_heap) { + .vma_start = HIGH_HEAP_MIN_ADDRESS, + /* Leave the last 4GiB out of the high vma range, so that no state + * base address + size can overflow 48 bits. For more information see + * the comment about Wa32bitGeneralStateOffset in anv_allocator.c + */ + .vma_size = gtt_size - (1ull << 32) - HIGH_HEAP_MIN_ADDRESS, .size = heap_size_48bit, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, .supports_48bit_addresses = true, }; device->memory.heaps[1] = (struct anv_memory_heap) { + .vma_start = LOW_HEAP_MIN_ADDRESS, + .vma_size = LOW_HEAP_SIZE, .size = heap_size_32bit, .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, .supports_48bit_addresses = false, @@ -1029,7 +1039,7 @@ void anv_GetPhysicalDeviceProperties( .maxPerStageDescriptorSampledImages = max_samplers, .maxPerStageDescriptorStorageImages = max_images, .maxPerStageDescriptorInputAttachments = 64, - .maxPerStageResources = 250, + .maxPerStageResources = MAX_BINDING_TABLE_SIZE - MAX_RTS, .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */ .maxDescriptorSetUniformBuffers = 6 * 64, /* number of stages * maxPerStageDescriptorUniformBuffers */ .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, @@ -1068,7 +1078,7 @@ void anv_GetPhysicalDeviceProperties( 16 * devinfo->max_cs_threads, 16 * devinfo->max_cs_threads, }, - .subPixelPrecisionBits = 4 /* FIXME */, + .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 4 /* FIXME */, .mipmapPrecisionBits = 4 /* FIXME */, .maxDrawIndexedIndexValue = UINT32_MAX, @@ -1806,18 +1816,16 @@ VkResult anv_CreateDevice( } /* keep the page with address zero out of the allocator */ - util_vma_heap_init(&device->vma_lo, LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE); - device->vma_lo_available = - 
physical_device->memory.heaps[physical_device->memory.heap_count - 1].size; - - /* Leave the last 4GiB out of the high vma range, so that no state base - * address + size can overflow 48 bits. For more information see the - * comment about Wa32bitGeneralStateOffset in anv_allocator.c - */ - util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS, - HIGH_HEAP_SIZE); + struct anv_memory_heap *low_heap = + &physical_device->memory.heaps[physical_device->memory.heap_count - 1]; + util_vma_heap_init(&device->vma_lo, low_heap->vma_start, low_heap->vma_size); + device->vma_lo_available = low_heap->size; + + struct anv_memory_heap *high_heap = + &physical_device->memory.heaps[0]; + util_vma_heap_init(&device->vma_hi, high_heap->vma_start, high_heap->vma_size); device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 : - physical_device->memory.heaps[0].size; + high_heap->size; } /* As per spec, the driver implementation may deny requests to acquire @@ -1866,7 +1874,7 @@ VkResult anv_CreateDevice( result = vk_error(VK_ERROR_INITIALIZATION_FAILED); goto fail_mutex; } - if (pthread_cond_init(&device->queue_submit, NULL) != 0) { + if (pthread_cond_init(&device->queue_submit, &condattr) != 0) { pthread_condattr_destroy(&condattr); result = vk_error(VK_ERROR_INITIALIZATION_FAILED); goto fail_mutex; @@ -2276,8 +2284,11 @@ anv_vma_free(struct anv_device *device, struct anv_bo *bo) util_vma_heap_free(&device->vma_lo, addr_48b, bo->size); device->vma_lo_available += bo->size; } else { - assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS && - addr_48b <= HIGH_HEAP_MAX_ADDRESS); + MAYBE_UNUSED const struct anv_physical_device *physical_device = + &device->instance->physicalDevice; + assert(addr_48b >= physical_device->memory.heaps[0].vma_start && + addr_48b < (physical_device->memory.heaps[0].vma_start + + physical_device->memory.heaps[0].vma_size)); util_vma_heap_free(&device->vma_hi, addr_48b, bo->size); device->vma_hi_available += bo->size; } diff --git 
a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index 22bad94e5b8..577ed111a2a 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -128,14 +128,15 @@ def __init__(self, version, enable): 'device->has_context_priority'), Extension('VK_EXT_pci_bus_info', 2, True), Extension('VK_EXT_post_depth_coverage', 1, 'device->info.gen >= 9'), + Extension('VK_EXT_queue_family_foreign', 1, 'ANDROID'), Extension('VK_EXT_sampler_filter_minmax', 1, 'device->info.gen >= 9'), - Extension('VK_EXT_scalar_block_layout', 1, True), + Extension('VK_EXT_scalar_block_layout', 1, '!ANDROID'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_shader_stencil_export', 1, 'device->info.gen >= 9'), Extension('VK_EXT_transform_feedback', 1, True), Extension('VK_EXT_vertex_attribute_divisor', 3, True), - Extension('VK_GOOGLE_decorate_string', 1, True), - Extension('VK_GOOGLE_hlsl_functionality1', 1, True), + Extension('VK_GOOGLE_decorate_string', 1, '!ANDROID'), + Extension('VK_GOOGLE_hlsl_functionality1', 1, '!ANDROID'), ] class VkVersion: diff --git a/src/intel/vulkan/anv_intel.c b/src/intel/vulkan/anv_intel.c index 08bff9585bc..f6b9584b410 100644 --- a/src/intel/vulkan/anv_intel.c +++ b/src/intel/vulkan/anv_intel.c @@ -64,7 +64,8 @@ VkResult anv_CreateDmaBufImageINTEL( .samples = 1, /* FIXME: Need a way to use X tiling to allow scanout */ .tiling = VK_IMAGE_TILING_OPTIMAL, - .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT, .flags = 0, }}, pAllocator, &image_h); diff --git a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c index 104c58dc5e2..0567a1be939 100644 --- a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c +++ b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c @@ -269,6 +269,7 @@ create_plane_tex_instr_implicit(struct ycbcr_state *state, tex->texture_index = 
old_tex->texture_index; tex->texture_array_size = old_tex->texture_array_size; tex->sampler_index = old_tex->sampler_index; + tex->is_array = old_tex->is_array; nir_ssa_dest_init(&tex->instr, &tex->dest, old_tex->dest.ssa.num_components, diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c index 02f2be60e02..a1c72395831 100644 --- a/src/intel/vulkan/anv_pass.c +++ b/src/intel/vulkan/anv_pass.c @@ -178,12 +178,28 @@ anv_render_pass_compile(struct anv_render_pass *pass) * subpasses and checking to see if any of them don't have an external * dependency. Or, we could just be lazy and add a couple extra flushes. * We choose to be lazy. + * + * From the documentation for vkCmdNextSubpass: + * + * "Moving to the next subpass automatically performs any multisample + * resolve operations in the subpass being ended. End-of-subpass + * multisample resolves are treated as color attachment writes for the + * purposes of synchronization. This applies to resolve operations for + * both color and depth/stencil attachments. That is, they are + * considered to execute in the + * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and + * their writes are synchronized with + * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT." + * + * Therefore, the above flags concerning color attachments also apply to + * color and depth/stencil resolve attachments. 
*/ if (all_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { pass->subpass_flushes[0] |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; } - if (all_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + if (all_usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT)) { pass->subpass_flushes[pass->subpass_count] |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index be869cfa061..1bdc896e708 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -377,12 +377,12 @@ populate_wm_prog_key(const struct gen_device_info *devinfo, * harmless to compute it and then let dead-code take care of it. */ if (ms_info->rasterizationSamples > 1) { - key->persample_interp = + key->persample_interp = ms_info->sampleShadingEnable && (ms_info->minSampleShading * ms_info->rasterizationSamples) > 1; key->multisample_fbo = true; } - key->frag_coord_adds_sample_pos = ms_info->sampleShadingEnable; + key->frag_coord_adds_sample_pos = key->persample_interp; } } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 110b2ccf023..9979b832a7b 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -120,12 +120,9 @@ struct gen_l3_config; #define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */ #define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL #define HIGH_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ -#define HIGH_HEAP_MAX_ADDRESS 0xfffeffffffffULL #define LOW_HEAP_SIZE \ (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1) -#define HIGH_HEAP_SIZE \ - (HIGH_HEAP_MAX_ADDRESS - HIGH_HEAP_MIN_ADDRESS + 1) #define DYNAMIC_STATE_POOL_SIZE \ (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1) #define BINDING_TABLE_POOL_SIZE \ @@ -163,6 +160,18 @@ struct gen_l3_config; #define MAX_GEN8_IMAGES 8 #define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */ +/* From the Skylake PRM Vol. 
7 "Binding Table Surface State Model": + * + * "The surface state model is used when a Binding Table Index (specified + * in the message descriptor) of less than 240 is specified. In this model, + * the Binding Table Index is used to index into the binding table, and the + * binding table entry contains a pointer to the SURFACE_STATE." + * + * Binding table values above 240 are used for various things in the hardware + * such as stateless, stateless with incoherent cache, SLM, and bindless. + */ +#define MAX_BINDING_TABLE_SIZE 240 + /* The kernel relocation API has a limitation of a 32-bit delta value * applied to the address before it is written which, in spite of it being * unsigned, is treated as signed . Because of the way that this maps to @@ -733,7 +742,7 @@ struct anv_state_table { struct anv_free_entry *map; uint32_t size; struct anv_block_state state; - struct u_vector mmap_cleanups; + struct u_vector cleanups; }; struct anv_state_pool { @@ -894,6 +903,8 @@ struct anv_memory_heap { VkMemoryHeapFlags flags; /* Driver-internal book-keeping */ + uint64_t vma_start; + uint64_t vma_size; bool supports_48bit_addresses; }; @@ -1449,10 +1460,10 @@ _anv_combine_address(struct anv_batch *batch, void *location, */ /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_MOCS 2 +#define GEN9_MOCS (2 << 1) /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_EXTERNAL_MOCS 1 +#define GEN9_EXTERNAL_MOCS (1 << 1) /* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */ #define GEN10_MOCS GEN9_MOCS @@ -1581,6 +1592,10 @@ struct anv_descriptor_set { uint32_t size; uint32_t buffer_count; struct anv_buffer_view *buffer_views; + + /* Link to descriptor pool's desc_sets list . 
*/ + struct list_head pool_link; + struct anv_descriptor descriptors[0]; }; @@ -1614,6 +1629,8 @@ struct anv_descriptor_pool { struct anv_state_stream surface_state_stream; void *surface_state_free_list; + struct list_head desc_sets; + char data[0]; }; @@ -3045,7 +3062,13 @@ anv_can_sample_with_hiz(const struct gen_device_info * const devinfo, if (!(image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) return false; - if (devinfo->gen < 8) + /* Allow this feature on BDW even though it is disabled in the BDW devinfo + * struct. There's documentation which suggests that this feature actually + * reduces performance on BDW, but it has only been observed to help so + * far. Sampling fast-cleared blocks on BDW must also be handled with care + * (see depth_stencil_attachment_compute_aux_usage() for more info). + */ + if (devinfo->gen != 8 && !devinfo->has_sample_with_hiz) return false; return image->samples == 1; diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 352892aee33..380283bdd56 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer) }; const int max = 0xffff; + + uint32_t y_min = s->offset.y; + uint32_t x_min = s->offset.x; + uint32_t y_max = s->offset.y + s->extent.height - 1; + uint32_t x_max = s->offset.x + s->extent.width - 1; + + /* Do this math using int64_t so overflow gets clamped correctly. 
*/ + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + y_min = clamp_int64((uint64_t) y_min, + cmd_buffer->state.render_area.offset.y, max); + x_min = clamp_int64((uint64_t) x_min, + cmd_buffer->state.render_area.offset.x, max); + y_max = clamp_int64((uint64_t) y_max, 0, + cmd_buffer->state.render_area.offset.y + + cmd_buffer->state.render_area.extent.height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, + cmd_buffer->state.render_area.offset.x + + cmd_buffer->state.render_area.extent.width - 1); + } else if (fb) { + y_min = clamp_int64((uint64_t) y_min, 0, max); + x_min = clamp_int64((uint64_t) x_min, 0, max); + y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1); + } + struct GEN7_SCISSOR_RECT scissor = { - /* Do this math using int64_t so overflow gets clamped correctly. */ - .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max), - .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max), - .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1), - .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1) + .ScissorRectangleYMin = y_min, + .ScissorRectangleXMin = x_min, + .ScissorRectangleYMax = y_max, + .ScissorRectangleXMax = x_max }; if (s->extent.width <= 0 || s->extent.height <= 0) { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index d980ec428d0..a3994f5870c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2653,7 +2653,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { sob.SOBufferIndex = idx; - if (cmd_buffer->state.xfb_enabled && xfb->buffer) { + if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { sob.SOBufferEnable = true; sob.MOCS = cmd_buffer->device->default_mocs, sob.StreamOffsetWriteEnable = false; 
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index d2142ae42c2..3e13a12d776 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -464,6 +464,7 @@ emit_rs_state(struct anv_pipeline *pipeline, sf.TriangleStripListProvokingVertexSelect = 0; sf.LineStripListProvokingVertexSelect = 0; sf.TriangleFanProvokingVertexSelect = 1; + sf.VertexSubPixelPrecisionSelect = _8Bit; const struct brw_vue_prog_data *last_vue_prog_data = anv_pipeline_get_last_vue_prog_data(pipeline); @@ -1077,6 +1078,10 @@ emit_3dstate_clip(struct anv_pipeline *pipeline, clip.APIMode = APIMODE_D3D, clip.ViewportXYClipTestEnable = true; +#if GEN_GEN >= 8 + clip.VertexSubPixelPrecisionSelect = _8Bit; +#endif + clip.ClipMode = CLIPMODE_NORMAL; clip.TriangleStripListProvokingVertexSelect = 0; @@ -1211,13 +1216,30 @@ emit_3dstate_streamout(struct anv_pipeline *pipeline, hole_dwords -= 4; } + int varying = output->location; + uint8_t component_mask = output->component_mask; + /* VARYING_SLOT_PSIZ contains three scalar fields packed together: + * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y + * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z + * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w + */ + if (varying == VARYING_SLOT_LAYER) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 1; // SO_DECL_COMPMASK_Y + } else if (varying == VARYING_SLOT_VIEWPORT) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 2; // SO_DECL_COMPMASK_Z + } else if (varying == VARYING_SLOT_PSIZ) { + component_mask = 1 << 3; // SO_DECL_COMPMASK_W + } + next_offset[buffer] = output->offset + - __builtin_popcount(output->component_mask) * 4; + __builtin_popcount(component_mask) * 4; so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { .OutputBufferSlot = buffer, - .RegisterIndex = vue_map->varying_to_slot[output->location], - .ComponentMask = output->component_mask, + .RegisterIndex = vue_map->varying_to_slot[varying], + .ComponentMask = component_mask, }; 
} @@ -2065,9 +2087,29 @@ compute_pipeline_create( vfe.URBEntryAllocationSize = GEN_GEN <= 7 ? 0 : 2; vfe.CURBEAllocationSize = vfe_curbe_allocation; - vfe.PerThreadScratchSpace = get_scratch_space(cs_bin); - vfe.ScratchSpaceBasePointer = - get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin); + if (cs_bin->prog_data->total_scratch) { + if (GEN_GEN >= 8) { + /* Broadwell's Per Thread Scratch Space is in the range [0, 11] + * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 11; + } else if (GEN_IS_HASWELL) { + /* Haswell's Per Thread Scratch Space is in the range [0, 10] + * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 12; + } else { + /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB] + * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. + */ + vfe.PerThreadScratchSpace = + cs_bin->prog_data->total_scratch / 1024 - 1; + } + vfe.ScratchSpaceBasePointer = + get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin); + } } struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 794d92dc6c9..6c1c76aeef0 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -356,14 +356,23 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, } static void -emit_query_availability(struct anv_cmd_buffer *cmd_buffer, - struct anv_address addr) +emit_query_mi_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) +{ + genX(cmd_buffer_mi_memset)(cmd_buffer, addr, available, 8); +} + +static void +emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WriteImmediateData; pc.Address = addr; - pc.ImmediateData = 1; + 
pc.ImmediateData = available; } } @@ -376,12 +385,40 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, struct anv_query_pool *pool, uint32_t first_index, uint32_t num_queries) { - for (uint32_t i = 0; i < num_queries; i++) { - struct anv_address slot_addr = - anv_query_address(pool, first_index + i); - genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8), - 0, pool->stride - 8); - emit_query_availability(cmd_buffer, slot_addr); + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + /* These queries are written with a PIPE_CONTROL so clear them using the + * PIPE_CONTROL as well so we don't have to synchronize between 2 types + * of operations. + */ + assert((pool->stride % 8) == 0); + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + + for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) { + emit_query_pc_availability(cmd_buffer, + anv_address_add(slot_addr, qword * 8), + false); + } + emit_query_pc_availability(cmd_buffer, slot_addr, true); + } + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8), + 0, pool->stride - 8); + emit_query_mi_availability(cmd_buffer, slot_addr, true); + } + break; + + default: + unreachable("Unsupported query type"); } } @@ -394,11 +431,28 @@ void genX(CmdResetQueryPool)( ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - for (uint32_t i = 0; i < queryCount; i++) { - anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) { - sdm.Address = anv_query_address(pool, firstQuery + i); - sdm.ImmediateData = 0; + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case 
VK_QUERY_TYPE_TIMESTAMP: + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_mi_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); } + break; + } + + default: + unreachable("Unsupported query type"); } } @@ -511,9 +565,9 @@ void genX(CmdBeginQueryIndexedEXT)( void genX(CmdEndQuery)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, - VkQueryControlFlags flags) + uint32_t query) { - genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, flags, 0); + genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0); } void genX(CmdEndQueryIndexedEXT)( @@ -529,7 +583,7 @@ void genX(CmdEndQueryIndexedEXT)( switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); - emit_query_availability(cmd_buffer, query_addr); + emit_query_pc_availability(cmd_buffer, query_addr, true); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: { @@ -548,7 +602,7 @@ void genX(CmdEndQueryIndexedEXT)( offset += 16; } - emit_query_availability(cmd_buffer, query_addr); + emit_query_mi_availability(cmd_buffer, query_addr, true); break; } @@ -559,7 +613,7 @@ void genX(CmdEndQueryIndexedEXT)( } emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 16)); - emit_query_availability(cmd_buffer, query_addr); + emit_query_mi_availability(cmd_buffer, query_addr, true); break; default: @@ -614,7 +668,7 @@ void genX(CmdWriteTimestamp)( break; } - emit_query_availability(cmd_buffer, query_addr); + emit_query_pc_availability(cmd_buffer, query_addr, true); /* When multiview is active the spec requires that N consecutive query * indices are used, where N is the number of active views in the subpass. 
@@ -817,7 +871,20 @@ void genX(CmdCopyQueryPoolResults)( } if ((flags & VK_QUERY_RESULT_WAIT_BIT) || - (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) { + (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to + * stall the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without + * any additional synchronization." + */ + pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_TIMESTAMP) { cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 05fdeca8c25..af1223ad3b9 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2017-2018 Intel Corporation +# Copyright © 2017-2019 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -105,7 +105,7 @@ foreach g : [['70', ['gen7_cmd_buffer.c']], ['75', ['gen7_cmd_buffer.c']], c_vis_args, no_override_init_args, c_sse2_args, '-DGEN_VERSIONx10=@0@'.format(_gen), ], - dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], + dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml], ) endforeach @@ -178,7 +178,10 @@ endif libanv_common = static_library( 'anv_common', - [libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h], + [ + libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h, + gen_xml_pack, + ], include_directories : [ inc_common, 
inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util, inc_vulkan_wsi, @@ -200,7 +203,7 @@ libvulkan_intel = shared_library( libvulkan_util, libvulkan_wsi, libmesa_util, ], dependencies : [ - dep_thread, dep_dl, dep_m, anv_deps, idep_nir, + dep_thread, dep_dl, dep_m, anv_deps, idep_nir, idep_genxml, ], c_args : anv_flags, link_args : ['-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections], diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index ad9b9d87b05..7d61c1df4fc 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -111,7 +111,7 @@ set_adaptive_sync_property(xcb_connection_t *conn, xcb_drawable_t drawable, xcb_intern_atom_reply_t* reply; xcb_void_cookie_t check; - cookie = xcb_intern_atom(conn, 0, sizeof(name), name); + cookie = xcb_intern_atom(conn, 0, strlen(name), name); reply = xcb_intern_atom_reply(conn, cookie, NULL); if (reply == NULL) return; diff --git a/src/mapi/es1api/meson.build b/src/mapi/es1api/meson.build index b0416e705a1..14ca49c1407 100644 --- a/src/mapi/es1api/meson.build +++ b/src/mapi/es1api/meson.build @@ -38,7 +38,7 @@ libglesv1_cm = shared_library( include_directories : [inc_src, inc_include, inc_mapi], link_with : libglapi, dependencies : [dep_thread, dep_libdrm, dep_m, dep_dl], - version : '1.0.0', + version : '1.1.0', install : true, ) diff --git a/src/mesa/drivers/dri/Android.mk b/src/mesa/drivers/dri/Android.mk index 53ff4b4f632..60c8476a38a 100644 --- a/src/mesa/drivers/dri/Android.mk +++ b/src/mesa/drivers/dri/Android.mk @@ -49,11 +49,19 @@ MESA_DRI_WHOLE_STATIC_LIBRARIES := \ MESA_DRI_SHARED_LIBRARIES := \ libcutils \ libdl \ - libexpat \ libglapi \ liblog \ libz +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +MESA_DRI_WHOLE_STATIC_LIBRARIES += \ + libexpat +else +MESA_DRI_SHARED_LIBRARIES += \ + libexpat +endif + 
#----------------------------------------------- # Build drivers and libmesa_dri_common diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk index 1574c8834c9..97def8f03fe 100644 --- a/src/mesa/drivers/dri/i965/Android.mk +++ b/src/mesa/drivers/dri/i965/Android.mk @@ -274,6 +274,8 @@ LOCAL_LDFLAGS += $(MESA_DRI_LDFLAGS) LOCAL_CFLAGS := \ $(MESA_DRI_CFLAGS) +LOCAL_CFLAGS += -Wno-error + LOCAL_C_INCLUDES := \ $(MESA_DRI_C_INCLUDES) \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_glsl,,) \ diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index b562c6ea21c..0bda2897e8e 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -34,6 +34,8 @@ AM_CFLAGS = \ -I$(top_builddir)/src/util \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_srcdir)/src/gtest/include \ + -I$(top_builddir)/src/compiler \ + -I$(top_srcdir)/src/compiler \ -I$(top_builddir)/src/compiler/glsl \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/src/compiler/nir \ diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c index f1675b191c1..43077e60da4 100644 --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c @@ -402,6 +402,8 @@ vma_alloc(struct brw_bufmgr *bufmgr, /* Without softpin support, we let the kernel assign addresses. 
*/ assert(brw_using_softpin(bufmgr)); + alignment = ALIGN(alignment, PAGE_SIZE); + struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size); uint64_t addr; @@ -1487,7 +1489,7 @@ brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd) brw_bo_make_external(bo); if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, - DRM_CLOEXEC, prime_fd) != 0) + DRM_CLOEXEC | DRM_RDWR, prime_fd) != 0) return -errno; bo->reusable = false; @@ -1717,6 +1719,9 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd) const uint64_t _4GB = 4ull << 30; + /* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */ + const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE; + if (devinfo->gen >= 8 && gtt_size > _4GB) { bufmgr->initial_kflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; @@ -1726,9 +1731,13 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd) bufmgr->initial_kflags |= EXEC_OBJECT_PINNED; util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_LOW_4G], - PAGE_SIZE, _4GB); + PAGE_SIZE, _4GB_minus_1); + + /* Leave the last 4GB out of the high vma range, so that no state + * base address + size can overflow 48 bits. 
+ */ util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_OTHER], - 1 * _4GB, gtt_size - 1 * _4GB); + 1 * _4GB, gtt_size - 2 * _4GB); } else if (devinfo->gen >= 10) { /* Softpin landed in 4.5, but GVT used an aliasing PPGTT until * kernel commit 6b3816d69628becb7ff35978aa0751798b4a940a in diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 505da9896b3..e4bc5fe99f0 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -893,6 +893,19 @@ brw_process_driconf_options(struct brw_context *brw) ctx->Const.dri_config_options_sha1 = ralloc_array(brw, unsigned char, 20); driComputeOptionsSha1(&brw->screen->optionCache, ctx->Const.dri_config_options_sha1); + + brw->screen->compiler->simd32_heuristics_control.grouped_sends_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_grouped_check"); + brw->screen->compiler->simd32_heuristics_control.max_grouped_sends = + driQueryOptioni(&brw->optionCache, "simd32_heuristic_grouped_sends"); + brw->screen->compiler->simd32_heuristics_control.inst_count_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_inst_check"); + brw->screen->compiler->simd32_heuristics_control.inst_count_ratio = + driQueryOptionf(&brw->optionCache, "simd32_heuristic_inst_ratio"); + brw->screen->compiler->simd32_heuristics_control.mrt_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_mrt_check"); + brw->screen->compiler->simd32_heuristics_control.max_mrts = + driQueryOptioni(&brw->optionCache, "simd32_heuristic_max_mrts"); } GLboolean diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 66fe5b3a8a0..7237f39d286 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -686,6 +686,7 @@ enum brw_query_kind { OA_COUNTERS, OA_COUNTERS_RAW, PIPELINE_STATS, + NULL_RENDERER, }; struct brw_perf_query_register_prog { @@ -1006,6 +1007,9 @@ struct 
brw_context /* High bits of the last seen index buffer address (for workarounds). */ uint16_t last_bo_high_bits; + + /* Used to understand is GPU state of primitive restart is up to date */ + bool enable_cut_index; } ib; /* Active vertex program: @@ -1246,6 +1250,7 @@ struct brw_context int n_active_oa_queries; int n_active_pipeline_stats_queries; + int n_active_null_renderers; /* The number of queries depending on running OA counters which * extends beyond brw_end_perf_query() since we need to wait until diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 2729a54e144..cdfa435a1f5 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1652,11 +1652,17 @@ enum brw_pixel_shader_coverage_mask_mode { #define GEN10_CACHE_MODE_SS 0x0e420 #define GEN10_FLOAT_BLEND_OPTIMIZATION_ENABLE (1 << 4) -#define INSTPM 0x20c0 +#define INSTPM 0x20c0 /* Gen6-8 */ # define INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 6) +# define INSTPM_GLOBAL_DEBUG_ENABLE (1 << 4) +# define INSTPM_MEDIA_INSTRUCTION_DISABLE (1 << 3) +# define INSTPM_3D_RENDERER_INSTRUCTION_DISABLE (1 << 2) +# define INSTPM_3D_STATE_INSTRUCTION_DISABLE (1 << 1) #define CS_DEBUG_MODE2 0x20d8 /* Gen9+ */ # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4) +# define CSDBG2_MEDIA_INSTRUCTION_DISABLE (1 << 1) +# define CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE (1 << 0) #define GEN7_RPSTAT1 0xA01C #define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index dfbc45fe938..2f52899fcb0 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -776,6 +776,14 @@ brw_upload_indices(struct brw_context *brw) brw->ib.index_size = index_buffer->index_size; brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER; } + + /* We need to re-emit an index buffer state each time + * when cut index flag 
is changed + */ + if (brw->prim_restart.enable_cut_index != brw->ib.enable_cut_index) { + brw->ib.enable_cut_index = brw->prim_restart.enable_cut_index; + brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER; + } } const struct brw_tracked_state brw_indices = { diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 2cbb1e0b879..95d87dc56fd 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -323,7 +323,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg) brw_shader_gather_info(prog->nir, prog); - NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shProg); NIR_PASS_V(prog->nir, gl_nir_lower_atomics, shProg, false); NIR_PASS_V(prog->nir, nir_lower_atomics_to_ssbo, prog->nir->info.num_abos); diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp index 256fdd8fc79..7e2a5b045dd 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp +++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp @@ -80,15 +80,15 @@ setup_vec4_image_param(uint32_t *params, uint32_t idx, } static void -brw_setup_image_uniform_values(gl_shader_stage stage, - struct brw_stage_prog_data *stage_prog_data, - unsigned param_start_index, - const gl_uniform_storage *storage) +brw_setup_image_uniform_values(nir_variable *var, + struct brw_stage_prog_data *prog_data) { - uint32_t *param = &stage_prog_data->param[param_start_index]; + unsigned param_start_index = var->data.driver_location / 4; + uint32_t *param = &prog_data->param[param_start_index]; + unsigned num_images = MAX2(1, var->type->arrays_of_arrays_size()); - for (unsigned i = 0; i < MAX2(storage->array_elements, 1); i++) { - const unsigned image_idx = storage->opaque[stage].index + i; + for (unsigned i = 0; i < num_images; i++) { + const unsigned image_idx = var->data.binding + i; /* Upload the brw_image_param structure. 
The order is expected to match * the BRW_IMAGE_PARAM_*_OFFSET defines. @@ -150,6 +150,14 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var, struct brw_stage_prog_data *stage_prog_data, bool is_scalar) { + if (var->type->without_array()->is_sampler()) + return; + + if (var->type->without_array()->is_image()) { + brw_setup_image_uniform_values(var, stage_prog_data); + return; + } + /* The data for our (non-builtin) uniforms is stored in a series of * gl_uniform_storage structs for each subcomponent that * glGetUniformLocation() could name. We know it's been set up in the same @@ -162,15 +170,17 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var, struct gl_uniform_storage *storage = &prog->sh.data->UniformStorage[var->data.location + u]; - if (storage->builtin || storage->type->is_sampler()) + /* We already handled samplers and images via the separate top-level + * variables created by gl_nir_lower_samplers_as_deref(), but they're + * still part of the structure's storage, and so we'll see them while + * walking it to set up the other regular fields. Just skip over them. + */ + if (storage->builtin || + storage->type->is_sampler() || + storage->type->is_image()) continue; - if (storage->type->is_image()) { - brw_setup_image_uniform_values(stage, stage_prog_data, - uniform_index, storage); - uniform_index += - BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1); - } else { + { gl_constant_value *components = storage->storage; unsigned vector_count = (MAX2(storage->array_elements, 1) * storage->type->matrix_columns); diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c index 10e3d024f17..85d14a83c7e 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_query.c +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c @@ -330,6 +330,12 @@ dump_perf_query_callback(GLuint id, void *query_void, void *brw_void) o->Active ? "Active," : (o->Ready ? 
"Ready," : "Pending,"), obj->pipeline_stats.bo ? "yes" : "no"); break; + case NULL_RENDERER: + DBG("%4d: %-6s %-8s NULL_RENDERER\n", + id, + o->Used ? "Dirty," : "New,", + o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,")); + break; default: unreachable("Unknown query type"); break; @@ -431,6 +437,10 @@ brw_get_perf_query_info(struct gl_context *ctx, *n_active = brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + *n_active = brw->perfquery.n_active_null_renderers; + break; + default: unreachable("Unknown query type"); break; @@ -1020,6 +1030,7 @@ brw_begin_perf_query(struct gl_context *ctx, struct brw_context *brw = brw_context(ctx); struct brw_perf_query_object *obj = brw_perf_query(o); const struct brw_perf_query_info *query = obj->query; + const struct gen_device_info *devinfo = &brw->screen->devinfo; /* We can assume the frontend hides mistaken attempts to Begin a * query object multiple times before its End. Similarly if an @@ -1104,7 +1115,6 @@ brw_begin_perf_query(struct gl_context *ctx, /* If the OA counters aren't already on, enable them. 
*/ if (brw->perfquery.oa_stream_fd == -1) { __DRIscreen *screen = brw->screen->driScrnPriv; - const struct gen_device_info *devinfo = &brw->screen->devinfo; /* The period_exponent gives a sampling period as follows: * sample_period = timestamp_period * 2^(period_exponent + 1) @@ -1250,6 +1260,23 @@ brw_begin_perf_query(struct gl_context *ctx, ++brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + ++brw->perfquery.n_active_null_renderers; + if (devinfo->gen >= 9) { + brw_load_register_imm32(brw, CS_DEBUG_MODE2, + REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE) | + CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE); + } else { + brw_load_register_imm32(brw, INSTPM, + REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE) | + INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE); + } + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_LRI_WRITE_IMMEDIATE); + break; + default: unreachable("Unknown query type"); break; @@ -1270,6 +1297,7 @@ brw_end_perf_query(struct gl_context *ctx, { struct brw_context *brw = brw_context(ctx); struct brw_perf_query_object *obj = brw_perf_query(o); + const struct gen_device_info *devinfo = &brw->screen->devinfo; DBG("End(%d)\n", o->Id); @@ -1312,6 +1340,21 @@ brw_end_perf_query(struct gl_context *ctx, --brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + if (--brw->perfquery.n_active_null_renderers == 0) { + if (devinfo->gen >= 9) { + brw_load_register_imm32(brw, CS_DEBUG_MODE2, + REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE)); + } else { + brw_load_register_imm32(brw, INSTPM, + REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE)); + } + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_LRI_WRITE_IMMEDIATE); + } + break; + default: unreachable("Unknown query type"); break; @@ -1337,6 +1380,9 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o) bo = obj->pipeline_stats.bo; break; + 
case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -1387,6 +1433,8 @@ brw_is_perf_query_ready(struct gl_context *ctx, return (obj->pipeline_stats.bo && !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) && !brw_bo_busy(obj->pipeline_stats.bo)); + case NULL_RENDERER: + return true; default: unreachable("Unknown query type"); @@ -1602,6 +1650,9 @@ brw_get_perf_query_data(struct gl_context *ctx, written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data); break; + case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -1672,6 +1723,9 @@ brw_delete_perf_query(struct gl_context *ctx, } break; + case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -2152,6 +2206,15 @@ get_register_queries_function(const struct gen_device_info *devinfo) return NULL; } +static void +fill_null_renderer_perf_query_info(struct brw_context *brw, + struct brw_perf_query_info *query) +{ + query->kind = NULL_RENDERER; + query->name = "Intel_Null_Hardware_Query"; + query->n_counters = 0; +} + static unsigned brw_init_perf_query_info(struct gl_context *ctx) { @@ -2210,6 +2273,10 @@ brw_init_perf_query_info(struct gl_context *ctx) enumerate_sysfs_metrics(brw); brw_perf_query_register_mdapi_oa_query(brw); + + struct brw_perf_query_info *null_query = + brw_perf_query_append_query_info(brw); + fill_null_renderer_perf_query_info(brw, null_query); } brw->perfquery.unaccumulated = diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 9ab25cf664c..841b7df896d 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -42,7 +42,8 @@ #include "compiler/glsl/ir.h" #include "compiler/glsl/program.h" #include "compiler/glsl/glsl_to_nir.h" -#include "compiler/glsl/float64_glsl.h" +#include "compiler/glsl/gl_nir.h" +#include "glsl/float64_glsl.h" #include "brw_program.h" #include "brw_context.h" @@ -165,6 +166,9 
@@ brw_create_nir(struct brw_context *brw, nir = brw_preprocess_nir(brw->screen->compiler, nir); + if (shader_prog) + NIR_PASS_V(nir, gl_nir_lower_samplers, shader_prog); + NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo); if (stage == MESA_SHADER_TESS_CTRL) { diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 7bbb6166344..9f88d625d63 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -309,6 +309,7 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx, const int s = u_bit_scan(&mask); key->swizzles[s] = SWIZZLE_NOOP; + key->scale_factors[s] = 0.0f; int unit_id = prog->SamplerUnits[s]; const struct gl_texture_unit *unit = &ctx->Texture.Unit[unit_id]; @@ -406,6 +407,10 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx, } if (t->Target == GL_TEXTURE_EXTERNAL_OES && intel_tex->planar_format) { + + /* Setup possible scaling factor. */ + key->scale_factors[s] = intel_tex->planar_format->scaling_factor; + switch (intel_tex->planar_format->components) { case __DRI_IMAGE_COMPONENTS_Y_UV: key->y_uv_image_mask |= 1 << s; diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index b067a174056..8269056c74c 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1681,6 +1681,11 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw) ISL_FORMAT_RAW, 3 * sizeof(GLuint), 1, RELOC_WRITE); + + /* The state buffer now holds a reference to our upload, drop ours. 
*/ + if (bo != brw->compute.num_work_groups_bo) + brw_bo_unreference(bo); + brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } } diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index dcdfb3c9292..73c983ce742 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -998,7 +998,8 @@ genX(emit_index_buffer)(struct brw_context *brw) brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) { #if GEN_GEN < 8 && !GEN_IS_HASWELL - ib.CutIndexEnable = brw->prim_restart.enable_cut_index; + assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index); + ib.CutIndexEnable = brw->ib.enable_cut_index; #endif ib.IndexFormat = brw_get_index_type(index_buffer->index_size); @@ -2445,7 +2446,7 @@ set_scissor_bits(const struct gl_context *ctx, int i, bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width); - bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0); + bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height); bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height); _mesa_intersect_scissor_bounding_box(ctx, i, bbox); diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 7513d15c3dd..92ecd612006 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -182,14 +182,16 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_conditional_render_inverted = true; ctx->Extensions.ARB_cull_distance = true; ctx->Extensions.ARB_draw_buffers_blend = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.ARB_enhanced_layouts = true; ctx->Extensions.ARB_ES3_compatibility = true; ctx->Extensions.ARB_fragment_layer_viewport = true; ctx->Extensions.ARB_pipeline_statistics_query = true; ctx->Extensions.ARB_sample_shading = 
true; ctx->Extensions.ARB_shading_language_420pack = true; - if (ctx->API != API_OPENGL_COMPAT) { + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) { ctx->Extensions.ARB_texture_buffer_object = true; ctx->Extensions.ARB_texture_buffer_object_rgb32 = true; ctx->Extensions.ARB_texture_buffer_range = true; @@ -199,7 +201,8 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_texture_multisample = true; ctx->Extensions.ARB_uniform_buffer_object = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.AMD_vertex_shader_layer = true; ctx->Extensions.EXT_framebuffer_multisample = true; ctx->Extensions.EXT_framebuffer_multisample_blit_scaled = true; @@ -228,7 +231,8 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_conservative_depth = true; ctx->Extensions.ARB_derivative_control = true; ctx->Extensions.ARB_framebuffer_no_attachments = true; - if (ctx->API != API_OPENGL_COMPAT) { + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) { ctx->Extensions.ARB_gpu_shader5 = true; ctx->Extensions.ARB_gpu_shader_fp64 = true; } @@ -239,7 +243,8 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_shader_image_size = true; ctx->Extensions.ARB_shader_precision = true; ctx->Extensions.ARB_shader_texture_image_samples = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.ARB_tessellation_shader = true; ctx->Extensions.ARB_texture_compression_bptc = true; ctx->Extensions.ARB_texture_view = true; @@ -248,7 +253,6 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.EXT_shader_samples_identical = true; ctx->Extensions.OES_primitive_bounding_box = true; ctx->Extensions.OES_texture_buffer = true; - ctx->Extensions.ARB_fragment_shader_interlock = true; if (can_do_pipelined_register_writes(brw->screen)) { 
ctx->Extensions.ARB_draw_indirect = true; @@ -313,6 +317,30 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.KHR_blend_equation_advanced_coherent = true; ctx->Extensions.KHR_texture_compression_astc_ldr = true; ctx->Extensions.KHR_texture_compression_astc_sliced_3d = true; + + /* + * From the Skylake PRM Vol. 7 (Memory Fence Message, page 221): + * "A memory fence message issued by a thread causes further messages + * issued by the thread to be blocked until all previous data port + * messages have completed, or the results can be globally observed from + * the point of view of other threads in the system." + * + * From the Haswell PRM Vol. 7 (Memory Fence, page 256): + * "A memory fence message issued by a thread causes further messages + * issued by the thread to be blocked until all previous messages issued + * by the thread to that data port (data cache or render cache) have + * been globally observed from the point of view of other threads in the + * system." + * + * Summarized: For ARB_fragment_shader_interlock to work, we need to + * ensure memory access ordering for all messages to the dataport from + * all threads. Memory fence messages prior to SKL only provide memory + * access ordering for messages from the same thread, so we can only + * support the feature from Gen9 onwards. 
+ * + */ + + ctx->Extensions.ARB_fragment_shader_interlock = true; } if (gen_device_info_is_9lp(devinfo)) @@ -321,7 +349,8 @@ intelInitExtensions(struct gl_context *ctx) if (devinfo->gen >= 6) ctx->Extensions.INTEL_performance_query = true; - if (ctx->API != API_OPENGL_COMPAT) + if (ctx->API != API_OPENGL_COMPAT || + ctx->Const.AllowHigherCompatVersion) ctx->Extensions.ARB_base_instance = true; if (ctx->API != API_OPENGL_CORE) ctx->Extensions.ARB_color_buffer_float = true; diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h index ca604159dc2..4ab8a49b8bb 100644 --- a/src/mesa/drivers/dri/i965/intel_image.h +++ b/src/mesa/drivers/dri/i965/intel_image.h @@ -62,6 +62,7 @@ struct intel_image_format { uint32_t dri_format; int cpp; } planes[3]; + float scaling_factor; }; struct __DRIimageRec { diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index 8838f977bb6..2436f48a065 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -61,6 +61,33 @@ DRI_CONF_BEGIN DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects") DRI_CONF_DESC_END DRI_CONF_OPT_END + + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_grouped_check, "true") + DRI_CONF_DESC(en, "Enable/disable grouped texture fetch " + "check in the SIMD32 selection heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_grouped_sends, int, 6, "1:999") + DRI_CONF_DESC(en, "How many grouped texture fetches should " + "the SIMD32 selection heuristic allow.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_inst_check, "true") + DRI_CONF_DESC(en, "Enable/disable SIMD32/SIMD16 instruction " + "count ratio check in the SIMD32 selection " + "heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_inst_ratio, float, 2.3, "1:999") + DRI_CONF_DESC(en, "SIMD32/SIMD16 instruction count ratio " + "the SIMD32 selection heuristic should allow.") + DRI_CONF_OPT_END 
+ DRI_CONF_OPT_BEGIN_B(simd32_heuristic_mrt_check, "true") + DRI_CONF_DESC(en, "Enable/disable MRT write check in the " + "SIMD32 selection heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_max_mrts, int, 1, "1:8") + DRI_CONF_DESC(en, "How many MRT writes should the SIMD32 " + "selection heuristic allow.") + DRI_CONF_OPT_END + DRI_CONF_MESA_NO_ERROR("false") DRI_CONF_SECTION_END @@ -282,6 +309,18 @@ static const struct intel_image_format intel_image_formats[] = { { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } }, + { __DRI_IMAGE_FOURCC_P010, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + + { __DRI_IMAGE_FOURCC_P012, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + + { __DRI_IMAGE_FOURCC_P016, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + { __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } }, diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build index cd3683ae7ec..0bc6125f19c 100644 --- a/src/mesa/drivers/dri/i965/meson.build +++ b/src/mesa/drivers/dri/i965/meson.build @@ -187,7 +187,7 @@ libi965 = static_library( i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler, libblorp ], - dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], + dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml], ) dri_drivers += libi965 diff --git a/src/mesa/drivers/dri/meson.build b/src/mesa/drivers/dri/meson.build index d98c823f5fe..dddc4ae3dfd 100644 --- a/src/mesa/drivers/dri/meson.build +++ b/src/mesa/drivers/dri/meson.build @@ -54,6 +54,10 @@ if dri_drivers != [] dep_selinux, dep_libdrm, dep_expat, dep_m, dep_thread, 
dep_dl, idep_nir, ], link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections], + # Will be deleted during installation, see install_megadrivers.py + install : true, + install_dir : dri_drivers_path, + name_suffix : 'so', ) meson.add_install_script( @@ -78,7 +82,7 @@ if with_dri filebase : 'dri', description : 'Direct Rendering Infrastructure', version : meson.project_version(), - variables : ['dridriverdir=${prefix}/' + dri_drivers_path], + variables : ['dridriverdir=' + dri_drivers_path], requires_private : dri_req_private, ) endif diff --git a/src/mesa/drivers/osmesa/meson.build b/src/mesa/drivers/osmesa/meson.build index a406bb3c210..c479b740131 100644 --- a/src/mesa/drivers/osmesa/meson.build +++ b/src/mesa/drivers/osmesa/meson.build @@ -33,7 +33,8 @@ libosmesa = shared_library( include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, ], - link_with : [libmesa_classic, libglapi_static, osmesa_link_with], + link_whole : libglapi_static, + link_with : [libmesa_classic, osmesa_link_with], dependencies : [dep_thread, dep_selinux], version : '8.0.0', install : true, diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index 492f01de957..9ce8a94c5de 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -1760,6 +1760,10 @@ _mesa_make_current( struct gl_context *newCtx, check_init_viewport(newCtx, drawBuffer->Width, drawBuffer->Height); } + else { + _mesa_reference_framebuffer(&newCtx->WinSysDrawBuffer, NULL); + _mesa_reference_framebuffer(&newCtx->WinSysReadBuffer, NULL); + } if (newCtx->FirstTimeCurrent) { handle_first_current(newCtx); diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c index 97461cede34..eb22fcbdb31 100644 --- a/src/mesa/main/dlist.c +++ b/src/mesa/main/dlist.c @@ -962,6 +962,8 @@ make_list(GLuint name, GLuint count) dlist->Name = name; dlist->Head = malloc(sizeof(Node) * count); dlist->Head[0].opcode = OPCODE_END_OF_LIST; + /* All InstSize[] entries must be 
non-zero */ + InstSize[OPCODE_END_OF_LIST] = 1; return dlist; } @@ -2753,6 +2755,7 @@ save_Fogiv(GLenum pname, const GLint *params) case GL_FOG_START: case GL_FOG_END: case GL_FOG_INDEX: + case GL_FOG_COORDINATE_SOURCE: p[0] = (GLfloat) *params; p[1] = 0.0f; p[2] = 0.0f; diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c index a9687913627..30560ba047e 100644 --- a/src/mesa/main/errors.c +++ b/src/mesa/main/errors.c @@ -231,6 +231,9 @@ _mesa_gl_vdebug(struct gl_context *ctx, _mesa_debug_get_id(id); len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args); + if (len >= MAX_DEBUG_MESSAGE_LENGTH) + /* message was truncated */ + len = MAX_DEBUG_MESSAGE_LENGTH - 1; _mesa_log_msg(ctx, source, type, *id, severity, len, s); } diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index 8290ea94dfc..341fd93efc6 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -4663,8 +4663,12 @@ get_fb_attachment(struct gl_context *ctx, struct gl_framebuffer *fb, case GL_COLOR_ATTACHMENT12: case GL_COLOR_ATTACHMENT13: case GL_COLOR_ATTACHMENT14: - case GL_COLOR_ATTACHMENT15: - return &fb->Attachment[BUFFER_COLOR0 + attachment - GL_COLOR_ATTACHMENT0]; + case GL_COLOR_ATTACHMENT15: { + const unsigned i = attachment - GL_COLOR_ATTACHMENT0; + if (i >= ctx->Const.MaxColorAttachments) + return NULL; + return &fb->Attachment[BUFFER_COLOR0 + i]; + } case GL_DEPTH: case GL_DEPTH_ATTACHMENT: case GL_DEPTH_STENCIL_ATTACHMENT: @@ -4691,6 +4695,29 @@ discard_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb, if (!att) continue; + /* If we're asked to invalidate just depth or just stencil, but the + * attachment is packed depth/stencil, then we can only use + * Driver.DiscardFramebuffer if the attachments list includes both depth + * and stencil and they both point at the same renderbuffer. 
+ */ + if ((attachments[i] == GL_DEPTH_ATTACHMENT || + attachments[i] == GL_STENCIL_ATTACHMENT) && + (!att->Renderbuffer || + att->Renderbuffer->_BaseFormat == GL_DEPTH_STENCIL)) { + GLenum other_format = (attachments[i] == GL_DEPTH_ATTACHMENT ? + GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT); + bool has_both = false; + for (int j = 0; j < numAttachments; j++) { + if (attachments[j] == other_format) { + has_both = true; + break; + } + } + + if (fb->Attachment[BUFFER_DEPTH].Renderbuffer != + fb->Attachment[BUFFER_STENCIL].Renderbuffer || !has_both) + continue; + } + + ctx->Driver.DiscardFramebuffer(ctx, fb, att); } } diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index ee77c45d03c..efc9c11f79d 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -906,6 +906,9 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu break; /* GL_EXT_external_objects */ + case GL_NUM_DEVICE_UUIDS_EXT: + v->value_int = 1; + break; case GL_DRIVER_UUID_EXT: _mesa_get_driver_uuid(ctx, v->value_int_4); break; diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk index c6470e6289e..13d0da85882 100644 --- a/src/mesa/program/Android.mk +++ b/src/mesa/program/Android.mk @@ -41,7 +41,7 @@ endef include $(MESA_TOP)/src/mesa/Makefile.sources include $(CLEAR_VARS) - +LOCAL_CFLAGS += -Wno-error LOCAL_MODULE := libmesa_program LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_STATIC_LIBRARIES := libmesa_nir \ diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c index 2bc1b6db6eb..4073030f536 100644 --- a/src/mesa/program/prog_parameter.c +++ b/src/mesa/program/prog_parameter.c @@ -271,6 +271,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList, p->Name = strdup(name ?
name : ""); p->Type = type; p->Size = size; + p->Padded = pad_and_align; p->DataType = datatype; paramList->ParameterValueOffset[oldNum] = oldValNum; diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h index cc551c18910..d3d5961f920 100644 --- a/src/mesa/program/prog_parameter.h +++ b/src/mesa/program/prog_parameter.h @@ -104,6 +104,12 @@ struct gl_program_parameter * A sequence of STATE_* tokens and integers to identify GL state. */ gl_state_index16 StateIndexes[STATE_LENGTH]; + + /** + * We need to keep track of whether the param is padded for use in the + * shader cache. + */ + bool Padded; }; diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c index fecaaf77da8..c54b50dc754 100644 --- a/src/mesa/state_tracker/st_cb_rasterpos.c +++ b/src/mesa/state_tracker/st_cb_rasterpos.c @@ -208,6 +208,10 @@ new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw) rs->prim.end = 1; rs->prim.start = 0; rs->prim.count = 1; + rs->prim.pad = 0; + rs->prim.num_instances = 1; + rs->prim.base_instance = 0; + rs->prim.is_indirect = 0; return rs; } diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 1e456d019d0..92e512a0f1c 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -223,8 +223,13 @@ void st_init_limits(struct pipe_screen *screen, pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents, MAX_UNIFORMS * 4); + /* For ARB programs, prog_src_register::Index is a signed 13-bit number. + * This gives us a limit of 4096 values - but we may need to generate + * internal values in addition to what the source program uses. So, we + * drop the limit one step lower, to 2048, to be safe. 
+ */ pc->MaxParameters = - pc->MaxNativeParameters = pc->MaxUniformComponents / 4; + pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048); pc->MaxInputComponents = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4; pc->MaxOutputComponents = @@ -362,10 +367,7 @@ void st_init_limits(struct pipe_screen *screen, c->Program[MESA_SHADER_VERTEX].MaxAttribs = MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); - /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number - * of inputs. It's always 2 colors + N generic inputs. */ - c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, - PIPE_SHADER_CAP_MAX_INPUTS); + c->MaxVarying = screen->get_param(screen, PIPE_CAP_MAX_VARYINGS); c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING); c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index aacb8788287..febde1a5e97 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -2356,6 +2356,8 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, bindings |= PIPE_BIND_DEPTH_STENCIL; else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 || internalFormat == GL_RGB || internalFormat == GL_RGBA || + internalFormat == GL_RGBA2 || + internalFormat == GL_RGB4 || internalFormat == GL_RGBA4 || internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 || internalFormat == GL_BGRA || internalFormat == GL_RGB16F || diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp index d7f2e3e6eaa..a05ec0fa586 100644 --- a/src/mesa/state_tracker/st_glsl_to_nir.cpp +++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp @@ -327,7 +327,7 @@ st_nir_opts(nir_shader *nir, bool scalar) NIR_PASS(progress, nir, nir_opt_if); NIR_PASS(progress, nir, nir_opt_dead_cf); NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, 
nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true); NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index 5efbd1fa1d2..67f1fcaf5ef 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -1105,10 +1105,17 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi, else { GET_CURRENT_CONTEXT(ctx); - ret = _mesa_make_current(NULL, NULL, NULL); - - if (ctx) + if (ctx) { + /* Before releasing the context, release its associated + * winsys buffers first. Then purge the context's winsys buffers list + * to free the resources of any winsys buffers that no longer have + * an existing drawable. + */ + ret = _mesa_make_current(ctx, NULL, NULL); st_framebuffers_purge(ctx->st); + } + + ret = _mesa_make_current(NULL, NULL, NULL); } return ret; diff --git a/src/mesa/state_tracker/st_tgsi_lower_yuv.c b/src/mesa/state_tracker/st_tgsi_lower_yuv.c index 6acd173adc9..73437ddda70 100644 --- a/src/mesa/state_tracker/st_tgsi_lower_yuv.c +++ b/src/mesa/state_tracker/st_tgsi_lower_yuv.c @@ -269,31 +269,39 @@ yuv_to_rgb(struct tgsi_transform_context *tctx, tctx->emit_instruction(tctx, &inst); /* DP3 dst.x, tmpA, imm[0] */ - inst = dp3_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X); - reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); - reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_X) { + inst = dp3_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X); + reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); + reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W)); + tctx->emit_instruction(tctx, &inst); + } /* DP3 dst.y, tmpA, imm[1] */ - inst = dp3_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y); - 
reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); - reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_Y) { + inst = dp3_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y); + reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); + reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W)); + tctx->emit_instruction(tctx, &inst); + } /* DP3 dst.z, tmpA, imm[2] */ - inst = dp3_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z); - reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); - reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_Z) { + inst = dp3_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z); + reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W)); + reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W)); + tctx->emit_instruction(tctx, &inst); + } /* MOV dst.w, imm[0].x */ - inst = mov_instruction(); - reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W); - reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W)); - tctx->emit_instruction(tctx, &inst); + if (dst->Register.WriteMask & TGSI_WRITEMASK_W) { + inst = mov_instruction(); + reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W); + reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W)); + tctx->emit_instruction(tctx, &inst); + } } static void @@ -434,7 +442,7 @@ st_tgsi_lower_yuv(const struct tgsi_token *tokens, unsigned free_slots, /* TODO better job of figuring out how many extra tokens we need.. 
* this is a pain about tgsi_transform :-/ */ - newlen = tgsi_num_tokens(tokens) + 120; + newlen = tgsi_num_tokens(tokens) + 300; newtoks = tgsi_alloc_tokens(newlen); if (!newtoks) return NULL; diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index cb0e6e659e2..c38334140b6 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -98,6 +98,11 @@ TODO: document the other workarounds.