diff --git a/Android.common.mk b/Android.common.mk
index 60459d16eba..8e9e10a238d 100644
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -31,6 +31,7 @@ LOCAL_C_INCLUDES += \
MESA_VERSION := $(shell cat $(MESA_TOP)/VERSION)
LOCAL_CFLAGS += \
+ -O3 \
-Wno-error \
-Wno-unused-parameter \
-Wno-pointer-arith \
@@ -77,14 +78,23 @@ LOCAL_CFLAGS += \
-fvisibility=hidden \
-fno-math-errno \
-fno-trapping-math \
- -Wno-sign-compare
+ -Wno-sign-compare \
+ -Wno-self-assign \
+ -Wno-constant-logical-operand \
+ -Wno-format \
+ -Wno-incompatible-pointer-types \
+ -Wno-enum-conversion
LOCAL_CPPFLAGS += \
-D__STDC_CONSTANT_MACROS \
-D__STDC_FORMAT_MACROS \
-D__STDC_LIMIT_MACROS \
-Wno-error=non-virtual-dtor \
- -Wno-non-virtual-dtor
+ -Wno-non-virtual-dtor \
+ -Wno-delete-non-virtual-dtor \
+ -Wno-overloaded-virtual \
+ -Wno-missing-braces \
+ -Wno-deprecated-register
# mesa requires at least c99 compiler
LOCAL_CONLYFLAGS += \
diff --git a/Makefile.am b/Makefile.am
index e7e14f5b3cd..6d3c8cc19b4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,6 +22,7 @@
SUBDIRS = src
AM_DISTCHECK_CONFIGURE_FLAGS = \
+ --enable-autotools \
--enable-dri \
--enable-dri3 \
--enable-egl \
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 00000000000..5df295abc3a
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,2 @@
+Any security-related issues should be reported by following the instructions here:
+https://01.org/security
diff --git a/VERSION b/VERSION
index 5bd94c44a5c..309a41134e2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-19.0.0-devel
+19.0.6
diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore
new file mode 100644
index 00000000000..95d704509b7
--- /dev/null
+++ b/bin/.cherry-ignore
@@ -0,0 +1,46 @@
+# Both of these were already merged with different shas
+da48cba61ef6fefb799bf96e6364b70dbf4ec712
+c812c740e60c14060eb89db66039111881a0f42f
+
+# The commit these fix was reverted from 19.0 but fixed in 19.1, due
+# to the number of fixes required to make that commit work
+8d8f80af3a17354508f2ec9d6559c915d5be351d
+0c0c69729b6d72a5297122856c8fe48510e90764
+0881e90c09965818b02e359474a6f7446b41d647
+b031c643491a92a5574c7a4bd659df33f2d89bb6
+
+# These were manually rebased by Jason, thanks!
+8ab95b849e66f3221d80a67eef2ec6e3730901a8
+5c30fffeec1732c21d600c036f95f8cdb1bb5487
+
+# This doesn't actually apply to 19.0
+29179f58c6ba8099859ea25900214dbbd3814a92
+
+# This was superseded by a manual backport from Ken
+6981069fc805da1afc867ca3c905075d146d7ff9
+
+# This was manually backported
+0bc1942c9ddce4e796322a7561f06af5dec0decd
+
+# This doesn't need to be applied, it already seems to exist in stable.
+80dc78407d0d1e03ceddf8889b217e8fd113568d
+
+# This was backported manually
+4f18c43d1df64135e8968a7d4fbfd2c9918b76ae
+
+# These were de-nominated since they don't apply nicely
+88105375c978f9de82af8c654051e5aa16d61614
+c9358621276ae49162e58d4a16fe37abda6a347f
+
+# These are only for 19.1
+c3538ab5702ceeead284c2b5f9e700f3082c8135
+d2aa65eb1892f7b300ac24560f9dbda6b600b5a7
+78e35df52aa2f7d770f929a0866a0faa89c261a9
+0f1b070bad34c46c4bcc6c679fa533bf6b4b79e5
+ad2b4aa37806779bdfc15d704940136c3db21eb4
+9dc57eebd578b976b94c54d083377ba0920d43a8
+5820ac6756898a1bd30bde04555437a55c378726
+
+# This was manually rebased and the script doesn't understand that for some
+# reason
+cb7c9b2a9352cc73a2d3becc0427c53c8baf153a
diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh
index 15f0e7d4a34..8fa4f438771 100755
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -13,12 +13,12 @@
is_stable_nomination()
{
- git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable"
+ git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-stable"
}
is_typod_nomination()
{
- git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev"
+ git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-dev"
}
fixes=
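Forcing --pretty=medium here guards against a user's format.pretty configuration changing the `git show` output and silently breaking the grep. A minimal Python sketch of the same check, assuming only that git is on PATH (the helper mirrors the shell function but is otherwise hypothetical):

    import subprocess

    def is_stable_nomination(sha):
        # --pretty=medium pins the log format, so the "CC: ... mesa-stable"
        # trailer is printed regardless of the user's git configuration.
        out = subprocess.run(["git", "show", "--pretty=medium", "--summary", sha],
                             capture_output=True, text=True).stdout
        return any("cc:" in line.lower() and "mesa-stable" in line.lower()
                   for line in out.splitlines())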
diff --git a/bin/install_megadrivers.py b/bin/install_megadrivers.py
index d29b1911218..b5ac78887bf 100644
--- a/bin/install_megadrivers.py
+++ b/bin/install_megadrivers.py
@@ -35,7 +35,11 @@ def main():
args = parser.parse_args()
if os.path.isabs(args.libdir):
- to = os.path.join(os.environ.get('DESTDIR', '/'), args.libdir[1:])
+ destdir = os.environ.get('DESTDIR')
+ if destdir:
+ to = os.path.join(destdir, args.libdir[1:])
+ else:
+ to = args.libdir
else:
to = os.path.join(os.environ['MESON_INSTALL_DESTDIR_PREFIX'], args.libdir)
@@ -45,7 +49,6 @@ def main():
if os.path.lexists(to):
os.unlink(to)
os.makedirs(to)
- shutil.copy(args.megadriver, master)
for driver in args.drivers:
abs_driver = os.path.join(to, driver)
@@ -67,7 +70,14 @@ def main():
name, ext = os.path.splitext(name)
finally:
os.chdir(ret)
+
+ # Remove meson-created master .so and symlinks
os.unlink(master)
+ name, ext = os.path.splitext(master)
+ while ext != '.so':
+ if os.path.lexists(name):
+ os.unlink(name)
+ name, ext = os.path.splitext(name)
if __name__ == '__main__':
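The cleanup loop added above walks version suffixes off the megadriver name one extension at a time, unlinking each intermediate symlink until it reaches the bare .so. A standalone sketch of that suffix walk (file names are made-up examples):

    import os.path

    def versioned_names(master):
        # 'libfoo.so.1.2.3' -> ['libfoo.so.1.2', 'libfoo.so.1', 'libfoo.so'];
        # the script unlinks each of these if it exists.
        names = []
        name, ext = os.path.splitext(master)
        while ext != '.so':
            names.append(name)
            name, ext = os.path.splitext(name)
        return names

    assert versioned_names('libfoo.so.1.2.3') == \
        ['libfoo.so.1.2', 'libfoo.so.1', 'libfoo.so']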
diff --git a/configure.ac b/configure.ac
index 858da79f4d0..b288ecbd265 100644
--- a/configure.ac
+++ b/configure.ac
@@ -122,7 +122,7 @@ LLVM_REQUIRED_OPENCL=3.9.0
LLVM_REQUIRED_R600=3.9.0
LLVM_REQUIRED_RADEONSI=7.0.0
LLVM_REQUIRED_RADV=7.0.0
-LLVM_REQUIRED_SWR=6.0.0
+LLVM_REQUIRED_SWR=7.0.0
dnl Check for progs
AC_PROG_CPP
@@ -1922,7 +1922,7 @@ if test x"$enable_dri3" = xyes; then
dri3_modifier_modules="xcb-dri3 >= $XCBDRI3_MODIFIERS_REQUIRED xcb-present >= $XCBPRESENT_MODIFIERS_REQUIRED"
PKG_CHECK_MODULES([XCB_DRI3_MODIFIERS], [$dri3_modifier_modules], [have_dri3_modifiers=yes], [have_dri3_modifiers=no])
- if test "x$have_dri3_modifiers" == xyes; then
+ if test "x$have_dri3_modifiers" = xyes; then
DEFINES="$DEFINES -DHAVE_DRI3_MODIFIERS"
fi
fi
@@ -2357,7 +2357,7 @@ if test "x$enable_xvmc" = xyes -o \
"x$enable_omx_tizonia" = xyes -o \
"x$enable_va" = xyes; then
if echo $platforms | grep -q "x11"; then
- PKG_CHECK_MODULES([VL], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED])
+ PKG_CHECK_MODULES([VL], [x11-xcb xcb xcb-dri2 >= $XCBDRI2_REQUIRED libdrm >= $LIBDRM_REQUIRED])
fi
need_gallium_vl_winsys=yes
fi
@@ -2845,8 +2845,8 @@ if test -n "$with_gallium_drivers"; then
fi
# XXX: Keep in sync with LLVM_REQUIRED_SWR
-AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x6.0.0 -a \
- "x$LLVM_VERSION" != x6.0.1)
+AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x7.0.0 -a \
+ "x$LLVM_VERSION" != x7.0.1)
if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then
llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium"
@@ -2949,7 +2949,7 @@ if test "x$enable_llvm" = xyes; then
dnl the LLVM library propagated in the Libs.private of the respective .pc
dnl file which ensures complete dependency information when statically
dnl linking.
- if test "x$enable_glx" == xgallium-xlib; then
+ if test "x$enable_glx" = xgallium-xlib; then
GL_PC_LIB_PRIV="$GL_PC_LIB_PRIV $LLVM_LIBS"
fi
if test "x$enable_gallium_osmesa" = xyes; then
diff --git a/docs/envvars.html b/docs/envvars.html
index c9733e65234..43d3a6cf169 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -338,6 +338,9 @@
errors - GLSL compilation and link errors will be reported to stderr.
Example: export MESA_GLSL=dump,nopt
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index f2e46f65f92..6d134e3a40f 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1352,6 +1352,10 @@ struct __DRIdri2ExtensionRec {
#define __DRI_IMAGE_FOURCC_YVU422 0x36315659
#define __DRI_IMAGE_FOURCC_YVU444 0x34325659
+#define __DRI_IMAGE_FOURCC_P010 0x30313050
+#define __DRI_IMAGE_FOURCC_P012 0x32313050
+#define __DRI_IMAGE_FOURCC_P016 0x36313050
+
/**
* Queryable on images created by createImageFromNames.
*
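These FOURCC values are simply the four ASCII characters of the format tag packed little-endian, the same convention as drm_fourcc.h. A quick check (hypothetical helper, not part of the header):

    def fourcc(tag):
        # Pack the 4 ASCII bytes little-endian: 'P010' -> 0x30313050.
        return int.from_bytes(tag.encode('ascii'), 'little')

    assert fourcc('P010') == 0x30313050
    assert fourcc('P012') == 0x32313050
    assert fourcc('P016') == 0x36313050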
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 7201562d824..b91abd7a3f9 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -171,6 +171,7 @@ CHIPSET(0x3185, glk_2x6, "Intel(R) UHD Graphics 600 (Geminilake 2x6)")
CHIPSET(0x3E90, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)")
CHIPSET(0x3E93, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)")
CHIPSET(0x3E99, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
+CHIPSET(0x3E9C, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
CHIPSET(0x3E91, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
CHIPSET(0x3E92, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
@@ -203,6 +204,10 @@ CHIPSET(0x5A54, cnl_5x8, "Intel(R) HD Graphics (Cannonlake 5x8 GT2)")
CHIPSET(0x8A50, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)")
CHIPSET(0x8A51, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)")
CHIPSET(0x8A52, icl_8x8, "Intel(R) HD Graphics (Ice Lake 8x8 GT2)")
+CHIPSET(0x8A56, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)")
+CHIPSET(0x8A57, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
+CHIPSET(0x8A58, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)")
+CHIPSET(0x8A59, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
CHIPSET(0x8A5A, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
CHIPSET(0x8A5B, icl_4x8, "Intel(R) HD Graphics (Ice Lake 4x8 GT1)")
CHIPSET(0x8A5C, icl_6x8, "Intel(R) HD Graphics (Ice Lake 6x8 GT1.5)")
diff --git a/meson.build b/meson.build
index d975b0dbf4b..5286b91c615 100644
--- a/meson.build
+++ b/meson.build
@@ -61,11 +61,11 @@ endif
dri_drivers_path = get_option('dri-drivers-path')
if dri_drivers_path == ''
- dri_drivers_path = join_paths(get_option('libdir'), 'dri')
+ dri_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'dri')
endif
dri_search_path = get_option('dri-search-path')
if dri_search_path == ''
- dri_search_path = join_paths(get_option('prefix'), dri_drivers_path)
+ dri_search_path = dri_drivers_path
endif
with_gles1 = get_option('gles1')
@@ -608,7 +608,7 @@ with_gallium_xa = _xa != 'false'
d3d_drivers_path = get_option('d3d-drivers-path')
if d3d_drivers_path == ''
- d3d_drivers_path = join_paths(get_option('libdir'), 'd3d')
+ d3d_drivers_path = join_paths(get_option('prefix'), get_option('libdir'), 'd3d')
endif
with_gallium_st_nine = get_option('gallium-nine')
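Prefixing these defaults matters because Meson's join_paths, like os.path.join, lets an absolute component reset the result: a relative -Ddri-drivers-path or -Dd3d-drivers-path now lands under the prefix, while an absolute override still wins outright. Illustrated with os.path.join (example paths):

    import os.path

    # relative libdir: prefixed as intended
    print(os.path.join('/usr/local', 'lib', 'dri'))       # /usr/local/lib/dri
    # absolute override: everything before it is discarded
    print(os.path.join('/usr/local', '/opt/lib', 'dri'))  # /opt/lib/dri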
@@ -1213,6 +1213,7 @@ if _llvm != 'false'
with_gallium_opencl or _llvm == 'true'
),
static : not _shared_llvm,
+ method : 'config-tool',
)
with_llvm = dep_llvm.found()
endif
@@ -1387,12 +1388,14 @@ if with_platform_x11
dep_xshmfence = dependency('xshmfence', version : '>= 1.1')
endif
endif
- if with_glx == 'dri'
+ if with_glx == 'dri' or with_glx == 'gallium-xlib'
+ dep_glproto = dependency('glproto', version : '>= 1.4.14')
+ endif
+ if with_glx == 'dri'
if with_dri_platform == 'drm'
dep_dri2proto = dependency('dri2proto', version : '>= 2.8')
dep_xxf86vm = dependency('xxf86vm')
endif
- dep_glproto = dependency('glproto', version : '>= 1.4.14')
endif
if (with_egl or (
with_gallium_vdpau or with_gallium_xvmc or with_gallium_xa or
@@ -1400,7 +1403,7 @@ if with_platform_x11
dep_xcb_xfixes = dependency('xcb-xfixes')
endif
if with_xlib_lease
- dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12')
+ dep_xcb_xrandr = dependency('xcb-randr')
dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3')
endif
endif
diff --git a/scons/custom.py b/scons/custom.py
index 09946fa7324..8028990ef61 100644
--- a/scons/custom.py
+++ b/scons/custom.py
@@ -48,7 +48,12 @@
# a path directly. We want to support both, so we need to detect the SCons version,
# for which no API is provided by SCons 8-P
-scons_version = tuple(map(int, SCons.__version__.split('.')))
+# Scons version string has consistently been in this format:
+# MajorVersion.MinorVersion.Patch[.alpha/beta.yyyymmdd]
+# so this formula should cover all version types: stable, alpha
+# and beta.
+# For simplicity the alpha and beta suffixes are dropped.
+scons_version = tuple(map(int, SCons.__version__.split('.')[:3]))
def quietCommandLines(env):
# Quiet command lines
diff --git a/scons/gallium.py b/scons/gallium.py
index 963834a5fbc..61bbeb2399f 100755
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -308,7 +308,20 @@ def generate(env):
if env.GetOption('num_jobs') <= 1:
env.SetOption('num_jobs', num_jobs())
- env.Decider('MD5-timestamp')
+ # Speed up dependency checking. See
+ # - https://github.com/SCons/scons/wiki/GoFastButton
+ # - https://bugs.freedesktop.org/show_bug.cgi?id=109443
+
+ # Scons version string has consistently been in this format:
+ # MajorVersion.MinorVersion.Patch[.alpha/beta.yyyymmdd]
+ # so this formula should cover all version types: stable, alpha
+ # and beta.
+ # For simplicity the alpha and beta suffixes are dropped.
+
+ scons_version = distutils.version.StrictVersion('.'.join(SCons.__version__.split('.')[:3]))
+ if scons_version < distutils.version.StrictVersion('3.0.2') or \
+ scons_version > distutils.version.StrictVersion('3.0.4'):
+ env.Decider('MD5-timestamp')
env.SetOption('max_drift', 60)
# C preprocessor options
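A sketch of the decider guard above, showing why the [:3] slice is needed before StrictVersion can parse a pre-release string (version strings are illustrative; 3.0.2-3.0.4 are the releases the linked bug targets):

    import distutils.version

    def use_md5_timestamp(raw):
        v = distutils.version.StrictVersion('.'.join(raw.split('.')[:3]))
        return (v < distutils.version.StrictVersion('3.0.2') or
                v > distutils.version.StrictVersion('3.0.4'))

    assert use_md5_timestamp('3.0.1')
    assert not use_md5_timestamp('3.0.3.alpha.20190709')  # slice drops suffix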
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index fc8c6a09d2f..7ba13c24953 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -367,9 +367,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21;
info->has_ctx_priority = info->drm_minor >= 22;
- /* TODO: Enable this once the kernel handles it efficiently. */
- info->has_local_buffers = info->drm_minor >= 20 &&
- !info->has_dedicated_vram;
+ info->has_local_buffers = info->drm_minor >= 20;
info->kernel_flushes_hdp_before_ib = true;
info->htile_cmask_support_1d_tiling = true;
info->si_TA_CS_BC_BASE_ADDR_allowed = true;
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 768364b2dc6..9ad279241ad 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -905,6 +905,37 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
}
+LLVMValueRef
+ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
+ LLVMValueRef llvm_chan,
+ LLVMValueRef attr_number,
+ LLVMValueRef params,
+ LLVMValueRef i,
+ LLVMValueRef j)
+{
+ LLVMValueRef args[6];
+ LLVMValueRef p1;
+
+ args[0] = i;
+ args[1] = llvm_chan;
+ args[2] = attr_number;
+ args[3] = ctx->i1false;
+ args[4] = params;
+
+ p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+ ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+
+ args[0] = p1;
+ args[1] = j;
+ args[2] = llvm_chan;
+ args[3] = attr_number;
+ args[4] = ctx->i1false;
+ args[5] = params;
+
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+ ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
+}
+
LLVMValueRef
ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
LLVMValueRef parameter,
@@ -923,6 +954,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
}
+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+ LLVMValueRef base_ptr,
+ LLVMValueRef index)
+{
+ return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
+}
+
LLVMValueRef
ac_build_gep0(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr,
@@ -3416,7 +3455,7 @@ ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
/* ws->result_reduce is already the correct value */
if (ws->enable_inclusive)
- ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+ ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
if (ws->enable_exclusive)
ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index e47893bbbe6..370e7e9741c 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -216,6 +216,14 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
LLVMValueRef i,
LLVMValueRef j);
+LLVMValueRef
+ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
+ LLVMValueRef llvm_chan,
+ LLVMValueRef attr_number,
+ LLVMValueRef params,
+ LLVMValueRef i,
+ LLVMValueRef j);
+
LLVMValueRef
ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
LLVMValueRef parameter,
@@ -223,6 +231,11 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
LLVMValueRef attr_number,
LLVMValueRef params);
+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+ LLVMValueRef base_ptr,
+ LLVMValueRef index);
+
LLVMValueRef
ac_build_gep0(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr,
diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 69446863b95..6063411310b 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -151,13 +151,14 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
LLVMTargetRef target = ac_get_llvm_target(triple);
snprintf(features, sizeof(features),
- "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s",
+ "+DumpCode,-fp32-denormals,+fp64-denormals%s%s%s%s%s%s",
HAVE_LLVM >= 0x0800 ? "" : ",+vgpr-spilling",
tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "",
tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "",
tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "",
- tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "");
-
+ tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "",
+ tm_options & AC_TM_NO_LOAD_STORE_OPT ? ",-load-store-opt" : "");
+
LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
target,
triple,
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index 6d961c06f8a..ca00540da80 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -65,6 +65,7 @@ enum ac_target_machine_options {
AC_TM_CHECK_IR = (1 << 5),
AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6),
AC_TM_CREATE_LOW_OPT = (1 << 7),
+ AC_TM_NO_LOAD_STORE_OPT = (1 << 8),
};
enum ac_float_mode {
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index efd3e260af1..a0815995b12 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1019,10 +1019,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
LLVMValueRef in[3];
for (unsigned chan = 0; chan < 3; chan++)
in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
- results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc",
+ results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc",
ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE);
- results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc",
+ results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc",
ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE);
+ LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema",
+ ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE);
+ results[0] = ac_build_fdiv(&ctx->ac, results[0], ma);
+ results[1] = ac_build_fdiv(&ctx->ac, results[1], ma);
+ LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5);
+ results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, "");
+ results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, "");
result = ac_build_gather_values(&ctx->ac, results, 2);
break;
}
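A float sketch of the math the corrected sequence implements, assuming the AMD convention that cubema returns twice the major-axis magnitude (so sc/ma and tc/ma land in [-0.5, 0.5]):

    def cube_face_coord(sc, tc, ma):
        # The +0.5 bias maps the face-local range onto the [0, 1]
        # coordinate that nir_op_cube_face_coord expects.
        return sc / ma + 0.5, tc / ma + 0.5

    # Direction (0, 0, 1) hits the centre of the +Z face.
    assert cube_face_coord(0.0, 0.0, 2.0) == (0.5, 0.5)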
@@ -1896,14 +1903,18 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
if (var) {
bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
var->data.mode == nir_var_shader_in;
- if (var->data.compact)
- stride = 1;
idx = var->data.driver_location;
comp = var->data.location_frac;
mode = var->data.mode;
get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), vs_in, NULL, NULL,
&const_index, &indir_index);
+
+ if (var->data.compact) {
+ stride = 1;
+ const_index += comp;
+ comp = 0;
+ }
}
if (instr->dest.ssa.bit_size == 64)
@@ -2006,18 +2017,28 @@ static void
visit_store_var(struct ac_nir_context *ctx,
nir_intrinsic_instr *instr)
{
- nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+ nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
LLVMValueRef temp_ptr, value;
- int idx = var->data.driver_location;
- unsigned comp = var->data.location_frac;
+ int idx = 0;
+ unsigned comp = 0;
LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1]));
int writemask = instr->const_index[0];
LLVMValueRef indir_index;
unsigned const_index;
- get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false,
- NULL, NULL, &const_index, &indir_index);
+ if (var) {
+ get_deref_offset(ctx, deref, false,
+ NULL, NULL, &const_index, &indir_index);
+ idx = var->data.driver_location;
+ comp = var->data.location_frac;
+
+ if (var->data.compact) {
+ const_index += comp;
+ comp = 0;
+ }
+ }
if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {
@@ -2030,7 +2051,7 @@ visit_store_var(struct ac_nir_context *ctx,
writemask = writemask << comp;
- switch (var->data.mode) {
+ switch (deref->mode) {
case nir_var_shader_out:
if (ctx->stage == MESA_SHADER_TESS_CTRL) {
@@ -2039,8 +2060,8 @@ visit_store_var(struct ac_nir_context *ctx,
unsigned const_index = 0;
const bool is_patch = var->data.patch;
- get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
- false, NULL, is_patch ? NULL : &vertex_index,
+ get_deref_offset(ctx, deref, false, NULL,
+ is_patch ? NULL : &vertex_index,
&const_index, &indir_index);
ctx->abi->store_tcs_outputs(ctx->abi, var,
@@ -2107,7 +2128,7 @@ visit_store_var(struct ac_nir_context *ctx,
int writemask = instr->const_index[0];
LLVMValueRef address = get_src(ctx, instr->src[0]);
LLVMValueRef val = get_src(ctx, instr->src[1]);
- if (util_is_power_of_two_nonzero(writemask)) {
+ if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) {
val = LLVMBuildBitCast(
ctx->ac.builder, val,
LLVMGetElementType(LLVMTypeOf(address)), "");
@@ -2338,10 +2359,12 @@ static void get_image_coords(struct ac_nir_context *ctx,
}
static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
- const nir_intrinsic_instr *instr, bool write)
+ const nir_intrinsic_instr *instr,
+ bool write, bool atomic)
{
LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write);
- if (ctx->abi->gfx9_stride_size_workaround) {
+ if (ctx->abi->gfx9_stride_size_workaround ||
+ (ctx->abi->gfx9_stride_size_workaround_for_atomic && atomic)) {
LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), "");
@@ -2374,7 +2397,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
unsigned num_channels = util_last_bit(mask);
LLVMValueRef rsrc, vindex;
- rsrc = get_image_buffer_descriptor(ctx, instr, false);
+ rsrc = get_image_buffer_descriptor(ctx, instr, false, false);
vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
ctx->ac.i32_0, "");
@@ -2418,7 +2441,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
if (dim == GLSL_SAMPLER_DIM_BUF) {
char name[48];
const char *types[] = { "f32", "v2f32", "v4f32" };
- LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true);
+ LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
unsigned src_channels = ac_get_llvm_num_components(src);
@@ -2514,11 +2537,14 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
params[param_count++] = get_src(ctx, instr->src[3]);
if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
- params[param_count++] = get_image_buffer_descriptor(ctx, instr, true);
+ params[param_count++] = get_image_buffer_descriptor(ctx, instr, true, true);
params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
ctx->ac.i32_0, ""); /* vindex */
params[param_count++] = ctx->ac.i32_0; /* voffset */
- if (HAVE_LLVM >= 0x800) {
+ if (HAVE_LLVM >= 0x900) {
+ /* XXX: The new raw/struct atomic intrinsics are buggy
+ * with LLVM 8, see r358579.
+ */
params[param_count++] = ctx->ac.i32_0; /* soffset */
params[param_count++] = ctx->ac.i32_0; /* slc */
@@ -3079,7 +3105,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
ctx->abi->frag_pos[2],
ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3])
};
- result = ac_build_gather_values(&ctx->ac, values, 4);
+ result = ac_to_integer(&ctx->ac,
+ ac_build_gather_values(&ctx->ac, values, 4));
break;
}
case nir_intrinsic_load_front_face:
@@ -3818,6 +3845,73 @@ static void visit_jump(struct ac_llvm_context *ctx,
}
}
+static LLVMTypeRef
+glsl_base_to_llvm_type(struct ac_llvm_context *ac,
+ enum glsl_base_type type)
+{
+ switch (type) {
+ case GLSL_TYPE_INT:
+ case GLSL_TYPE_UINT:
+ case GLSL_TYPE_BOOL:
+ case GLSL_TYPE_SUBROUTINE:
+ return ac->i32;
+ case GLSL_TYPE_INT16:
+ case GLSL_TYPE_UINT16:
+ return ac->i16;
+ case GLSL_TYPE_FLOAT:
+ return ac->f32;
+ case GLSL_TYPE_FLOAT16:
+ return ac->f16;
+ case GLSL_TYPE_INT64:
+ case GLSL_TYPE_UINT64:
+ return ac->i64;
+ case GLSL_TYPE_DOUBLE:
+ return ac->f64;
+ default:
+ unreachable("unknown GLSL type");
+ }
+}
+
+static LLVMTypeRef
+glsl_to_llvm_type(struct ac_llvm_context *ac,
+ const struct glsl_type *type)
+{
+ if (glsl_type_is_scalar(type)) {
+ return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
+ }
+
+ if (glsl_type_is_vector(type)) {
+ return LLVMVectorType(
+ glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
+ glsl_get_vector_elements(type));
+ }
+
+ if (glsl_type_is_matrix(type)) {
+ return LLVMArrayType(
+ glsl_to_llvm_type(ac, glsl_get_column_type(type)),
+ glsl_get_matrix_columns(type));
+ }
+
+ if (glsl_type_is_array(type)) {
+ return LLVMArrayType(
+ glsl_to_llvm_type(ac, glsl_get_array_element(type)),
+ glsl_get_length(type));
+ }
+
+ assert(glsl_type_is_struct(type));
+
+ LLVMTypeRef member_types[glsl_get_length(type)];
+
+ for (unsigned i = 0; i < glsl_get_length(type); i++) {
+ member_types[i] =
+ glsl_to_llvm_type(ac,
+ glsl_get_struct_field(type, i));
+ }
+
+ return LLVMStructTypeInContext(ac->context, member_types,
+ glsl_get_length(type), false);
+}
+
static void visit_deref(struct ac_nir_context *ctx,
nir_deref_instr *instr)
{
@@ -3839,9 +3933,27 @@ static void visit_deref(struct ac_nir_context *ctx,
result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
get_src(ctx, instr->arr.index));
break;
- case nir_deref_type_cast:
+ case nir_deref_type_ptr_as_array:
+ result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent),
+ get_src(ctx, instr->arr.index));
+ break;
+ case nir_deref_type_cast: {
result = get_src(ctx, instr->parent);
+
+ LLVMTypeRef pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type);
+ LLVMTypeRef type = LLVMPointerType(pointee_type, AC_ADDR_SPACE_LDS);
+
+ if (LLVMTypeOf(result) != type) {
+ if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) {
+ result = LLVMBuildBitCast(ctx->ac.builder, result,
+ type, "");
+ } else {
+ result = LLVMBuildIntToPtr(ctx->ac.builder, result,
+ type, "");
+ }
+ }
break;
+ }
default:
unreachable("Unhandled deref_instr deref type");
}
@@ -3990,73 +4102,6 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
}
}
-static LLVMTypeRef
-glsl_base_to_llvm_type(struct ac_llvm_context *ac,
- enum glsl_base_type type)
-{
- switch (type) {
- case GLSL_TYPE_INT:
- case GLSL_TYPE_UINT:
- case GLSL_TYPE_BOOL:
- case GLSL_TYPE_SUBROUTINE:
- return ac->i32;
- case GLSL_TYPE_INT16:
- case GLSL_TYPE_UINT16:
- return ac->i16;
- case GLSL_TYPE_FLOAT:
- return ac->f32;
- case GLSL_TYPE_FLOAT16:
- return ac->f16;
- case GLSL_TYPE_INT64:
- case GLSL_TYPE_UINT64:
- return ac->i64;
- case GLSL_TYPE_DOUBLE:
- return ac->f64;
- default:
- unreachable("unknown GLSL type");
- }
-}
-
-static LLVMTypeRef
-glsl_to_llvm_type(struct ac_llvm_context *ac,
- const struct glsl_type *type)
-{
- if (glsl_type_is_scalar(type)) {
- return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
- }
-
- if (glsl_type_is_vector(type)) {
- return LLVMVectorType(
- glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
- glsl_get_vector_elements(type));
- }
-
- if (glsl_type_is_matrix(type)) {
- return LLVMArrayType(
- glsl_to_llvm_type(ac, glsl_get_column_type(type)),
- glsl_get_matrix_columns(type));
- }
-
- if (glsl_type_is_array(type)) {
- return LLVMArrayType(
- glsl_to_llvm_type(ac, glsl_get_array_element(type)),
- glsl_get_length(type));
- }
-
- assert(glsl_type_is_struct(type));
-
- LLVMTypeRef member_types[glsl_get_length(type)];
-
- for (unsigned i = 0; i < glsl_get_length(type); i++) {
- member_types[i] =
- glsl_to_llvm_type(ac,
- glsl_get_struct_field(type, i));
- }
-
- return LLVMStructTypeInContext(ac->context, member_types,
- glsl_get_length(type), false);
-}
-
static void
setup_locals(struct ac_nir_context *ctx,
struct nir_function *func)
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index ee18e6c1923..9eb4d37257e 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -195,6 +195,7 @@ struct ac_shader_abi {
/* Whether to workaround GFX9 ignoring the stride for the buffer size if IDXEN=0
* and LLVM optimizes an indexed load with constant index to IDXEN=0. */
bool gfx9_stride_size_workaround;
+ bool gfx9_stride_size_workaround_for_atomic;
};
#endif /* AC_SHADER_ABI_H */
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
index 06c33ca45dc..ba43bee3cd9 100644
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@@ -132,17 +132,15 @@ libvulkan_radeon = shared_library(
'vulkan_radeon',
[libradv_files, radv_entrypoints, radv_extensions_c, vk_format_table_c, sha1_h],
include_directories : [
- inc_common, inc_amd, inc_amd_common, inc_compiler, inc_vulkan_util,
- inc_vulkan_wsi,
+ inc_common, inc_amd, inc_amd_common, inc_compiler, inc_vulkan_wsi,
],
link_with : [
- libamd_common, libamdgpu_addrlib, libvulkan_util, libvulkan_wsi,
- libmesa_util,
+ libamd_common, libamdgpu_addrlib, libvulkan_wsi, libmesa_util,
],
dependencies : [
dep_llvm, dep_libdrm_amdgpu, dep_thread, dep_elf, dep_dl, dep_m,
dep_valgrind, radv_deps,
- idep_nir,
+ idep_nir, idep_vulkan_util,
],
c_args : [c_vis_args, no_override_init_args, radv_flags],
cpp_args : [cpp_vis_args, radv_flags],
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 7f7f052986e..ab0016d0277 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -301,7 +301,6 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
-
cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
@@ -326,6 +325,8 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->record_result = VK_SUCCESS;
+ memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
+
for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
cmd_buffer->descriptors[i].dirty = 0;
cmd_buffer->descriptors[i].valid = 0;
@@ -338,14 +339,15 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
unsigned fence_offset, eop_bug_offset;
void *fence_ptr;
- radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0, &fence_offset,
+ radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
&fence_ptr);
+
cmd_buffer->gfx9_fence_va =
radv_buffer_get_va(cmd_buffer->upload.upload_bo);
cmd_buffer->gfx9_fence_va += fence_offset;
/* Allocate a buffer for the EOP bug on GFX9. */
- radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 0,
+ radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
&eop_bug_offset, &fence_ptr);
cmd_buffer->gfx9_eop_bug_va =
radv_buffer_get_va(cmd_buffer->upload.upload_bo);
@@ -416,6 +418,8 @@ radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
unsigned *out_offset,
void **ptr)
{
+ assert(util_is_power_of_two_nonzero(alignment));
+
uint64_t offset = align(cmd_buffer->upload.offset, alignment);
if (offset + size > cmd_buffer->upload.size) {
if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
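The new assert documents the contract that align() relies on: with a power-of-two alignment, rounding up can be done with a mask. A standalone sketch (helper names are illustrative, not the driver's):

    def align(offset, alignment):
        # Valid only for power-of-two alignments, hence the assert above.
        assert alignment and (alignment & (alignment - 1)) == 0
        return (offset + alignment - 1) & ~(alignment - 1)

    assert align(13, 8) == 16
    assert align(16, 8) == 16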
@@ -1255,7 +1259,7 @@ radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
++reg_count;
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0));
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, cmd_buffer->state.predicating));
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_PFP));
@@ -1279,7 +1283,7 @@ radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
uint64_t va = radv_buffer_get_va(image->bo);
va += image->offset + image->tc_compat_zrange_offset;
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_PFP));
@@ -1356,7 +1360,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
- if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
+ if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
@@ -1473,7 +1477,7 @@ radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 0));
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, cmd_buffer->state.predicating));
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_PFP));
@@ -1518,14 +1522,13 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
- if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
+ if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) {
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
radeon_emit(cs, 2);
} else {
- /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
COPY_DATA_DST_SEL(COPY_DATA_REG) |
@@ -2155,6 +2158,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
ia_multi_vgt_param =
si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
draw_info->indirect,
+ !!draw_info->strmout_buffer,
draw_info->indirect ? 0 : draw_info->count);
if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
@@ -4404,10 +4408,15 @@ static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffe
if (!radv_image_has_htile(image))
return;
- if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED &&
- radv_layout_has_htile(image, dst_layout, dst_queue_mask)) {
- /* TODO: merge with the clear if applicable */
- radv_initialize_htile(cmd_buffer, image, range, 0);
+ if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
+ uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
+
+ if (radv_layout_is_htile_compressed(image, dst_layout,
+ dst_queue_mask)) {
+ clear_value = 0;
+ }
+
+ radv_initialize_htile(cmd_buffer, image, range, clear_value);
} else if (!radv_layout_is_htile_compressed(image, src_layout, src_queue_mask) &&
radv_layout_is_htile_compressed(image, dst_layout, dst_queue_mask)) {
uint32_t clear_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
@@ -4727,7 +4736,7 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
- MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18);
+ MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 21);
/* Flags that only require a top-of-pipe event. */
VkPipelineStageFlags top_of_pipe_flags =
@@ -4837,8 +4846,11 @@ void radv_CmdBeginConditionalRenderingEXT(
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
+ struct radeon_cmdbuf *cs = cmd_buffer->cs;
bool draw_visible = true;
- uint64_t va;
+ uint64_t pred_value = 0;
+ uint64_t va, new_va;
+ unsigned pred_offset;
va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
@@ -4854,13 +4866,51 @@ void radv_CmdBeginConditionalRenderingEXT(
si_emit_cache_flush(cmd_buffer);
+ /* From the Vulkan spec 1.1.107:
+ *
+ * "If the 32-bit value at offset in buffer memory is zero, then the
+ * rendering commands are discarded, otherwise they are executed as
+ * normal. If the value of the predicate in buffer memory changes while
+ * conditional rendering is active, the rendering commands may be
+ * discarded in an implementation-dependent way. Some implementations
+ * may latch the value of the predicate upon beginning conditional
+ * rendering while others may read it before every rendering command."
+ *
+ * But, the AMD hardware treats the predicate as a 64-bit value which
+ * means we need a workaround in the driver. Luckily, we are not
+ * required to support the case where the value changes while
+ * predication is active.
+ *
+ * The workaround is as follows:
+ * 1) allocate a 64-bit value in the upload BO and initialize it to 0
+ * 2) copy the 32-bit predicate value to the upload BO
+ * 3) use the newly allocated VA for predication
+ *
+ * Based on the conditionalrender demo, it's faster to do the COPY_DATA
+ * in ME (+ sync PFP) instead of PFP.
+ */
+ radv_cmd_buffer_upload_data(cmd_buffer, 8, 16, &pred_value, &pred_offset);
+
+ new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
+
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
+ COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+ COPY_DATA_WR_CONFIRM);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ radeon_emit(cs, new_va);
+ radeon_emit(cs, new_va >> 32);
+
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+
/* Enable predication for this command buffer. */
- si_emit_set_predication_state(cmd_buffer, draw_visible, va);
+ si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
cmd_buffer->state.predicating = true;
/* Store conditional rendering user info. */
cmd_buffer->state.predication_type = draw_visible;
- cmd_buffer->state.predication_va = va;
+ cmd_buffer->state.predication_va = new_va;
}
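The comment's three steps can be checked with plain byte arithmetic: reading a 32-bit predicate as 64 bits is only safe if the upper four bytes are known to be zero. An illustrative sketch (not driver code):

    import struct

    app = struct.pack('<I', 1)        # 32-bit predicate written by the app
    junk = b'\xde\xad\xbe\xef'        # whatever happens to follow it
    assert struct.unpack('<Q', app + junk)[0] != 1   # 64-bit read goes wrong

    slot = bytearray(8)               # step 1: zero-initialized 64-bit slot
    slot[:4] = app                    # step 2: COPY_DATA of the 32-bit value
    assert struct.unpack('<Q', bytes(slot))[0] == 1  # step 3: safe to use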
void radv_CmdEndConditionalRenderingEXT(
@@ -4904,7 +4954,7 @@ void radv_CmdBindTransformFeedbackBuffersEXT(
enabled_mask |= 1 << idx;
}
- cmd_buffer->state.streamout.enabled_mask = enabled_mask;
+ cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 01712bd22ce..ac93434b8bd 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -51,6 +51,7 @@ enum {
RADV_DEBUG_CHECKIR = 0x200000,
RADV_DEBUG_NOTHREADLLVM = 0x400000,
RADV_DEBUG_NOBINNING = 0x800000,
+ RADV_DEBUG_NO_LOAD_STORE_OPT = 0x1000000,
};
enum {
diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c
index cebe06aa078..68171b5d244 100644
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout(
uint32_t immutable_sampler_count = 0;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
- if (pCreateInfo->pBindings[j].pImmutableSamplers)
+ if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+ pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
+ pCreateInfo->pBindings[j].pImmutableSamplers)
immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
}
@@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout(
set_layout->has_variable_descriptors = true;
}
- if (binding->pImmutableSamplers) {
+ if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
+ binding->pImmutableSamplers) {
set_layout->binding[b].immutable_samplers_offset = samplers_offset;
set_layout->binding[b].immutable_samplers_equal =
has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount);
@@ -965,9 +969,11 @@ void radv_update_descriptor_sets(
}
src_ptr += src_binding_layout->size / 4;
dst_ptr += dst_binding_layout->size / 4;
- dst_buffer_list[j] = src_buffer_list[j];
- ++src_buffer_list;
- ++dst_buffer_list;
+
+ if (src_binding_layout->type != VK_DESCRIPTOR_TYPE_SAMPLER) {
+ /* Sampler descriptors don't have a buffer list. */
+ dst_buffer_list[j] = src_buffer_list[j];
+ }
}
}
}
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 34d93b262f8..334c8bd4548 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -111,6 +111,7 @@ radv_get_device_name(enum radeon_family family, char *name, size_t name_len)
case CHIP_VEGAM: chip_string = "AMD RADV VEGA M"; break;
case CHIP_VEGA10: chip_string = "AMD RADV VEGA10"; break;
case CHIP_VEGA12: chip_string = "AMD RADV VEGA12"; break;
+ case CHIP_VEGA20: chip_string = "AMD RADV VEGA20"; break;
case CHIP_RAVEN: chip_string = "AMD RADV RAVEN"; break;
case CHIP_RAVEN2: chip_string = "AMD RADV RAVEN2"; break;
default: chip_string = "AMD RADV unknown"; break;
@@ -337,7 +338,7 @@ radv_physical_device_init(struct radv_physical_device *device,
device->rad_info.chip_class > GFX9)
fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n");
- radv_get_driver_uuid(&device->device_uuid);
+ radv_get_driver_uuid(&device->driver_uuid);
radv_get_device_uuid(&device->rad_info, &device->device_uuid);
if (device->rad_info.family == CHIP_STONEY ||
@@ -369,6 +370,11 @@ radv_physical_device_init(struct radv_physical_device *device,
device->dcc_msaa_allowed =
(device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);
+ /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */
+ device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 ||
+ (device->rad_info.chip_class >= VI &&
+ device->rad_info.me_fw_feature >= 41);
+
radv_physical_device_init_mem_types(device);
radv_fill_device_extension_table(device, &device->supported_extensions);
@@ -460,6 +466,7 @@ static const struct debug_control radv_debug_options[] = {
{"checkir", RADV_DEBUG_CHECKIR},
{"nothreadllvm", RADV_DEBUG_NOTHREADLLVM},
{"nobinning", RADV_DEBUG_NOBINNING},
+ {"noloadstoreopt", RADV_DEBUG_NO_LOAD_STORE_OPT},
{NULL, 0}
};
@@ -505,6 +512,13 @@ radv_handle_per_app_options(struct radv_instance *instance,
} else if (!strcmp(name, "DOOM_VFR")) {
/* Work around a Doom VFR game bug */
instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS;
+ } else if (!strcmp(name, "MonsterHunterWorld.exe")) {
+ /* Workaround for a WaW hazard when LLVM moves/merges
+ * load/store memory operations.
+ * See https://reviews.llvm.org/D61313
+ */
+ if (HAVE_LLVM < 0x900)
+ instance->debug_flags |= RADV_DEBUG_NO_LOAD_STORE_OPT;
}
}
@@ -734,8 +748,7 @@ void radv_GetPhysicalDeviceFeatures(
.alphaToOne = true,
.multiViewport = true,
.samplerAnisotropy = true,
- .textureCompressionETC2 = pdevice->rad_info.chip_class >= GFX9 ||
- pdevice->rad_info.family == CHIP_STONEY,
+ .textureCompressionETC2 = radv_device_supports_etc(pdevice),
.textureCompressionASTC_LDR = false,
.textureCompressionBC = true,
.occlusionQueryPrecise = true,
@@ -802,7 +815,7 @@ void radv_GetPhysicalDeviceFeatures2(
features->storageBuffer16BitAccess = enabled;
features->uniformAndStorageBuffer16BitAccess = enabled;
features->storagePushConstant16 = enabled;
- features->storageInputOutput16 = enabled;
+ features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
@@ -998,7 +1011,7 @@ void radv_GetPhysicalDeviceProperties(
.maxCullDistances = 8,
.maxCombinedClipAndCullDistances = 8,
.discreteQueuePriorities = 2,
- .pointSizeRange = { 0.125, 255.875 },
+ .pointSizeRange = { 0.0, 8192.0 },
.lineWidthRange = { 0.0, 7.9921875 },
.pointSizeGranularity = (1.0 / 8.0),
.lineWidthGranularity = (1.0 / 128.0),
@@ -2790,7 +2803,7 @@ VkResult radv_QueueSubmit(
struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
struct radeon_winsys_ctx *ctx = queue->hw_ctx;
int ret;
- uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
+ uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
uint32_t scratch_size = 0;
uint32_t compute_scratch_size = 0;
uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
index 1bf56943f25..187c0ba574d 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -100,7 +100,7 @@ def __init__(self, name, ext_version, enable):
Extension('VK_EXT_display_control', 1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
Extension('VK_EXT_debug_report', 9, True),
Extension('VK_EXT_depth_range_unrestricted', 1, True),
- Extension('VK_EXT_descriptor_indexing', 2, True),
+ Extension('VK_EXT_descriptor_indexing', 2, False),
Extension('VK_EXT_discard_rectangles', 1, True),
Extension('VK_EXT_external_memory_dma_buf', 1, True),
Extension('VK_EXT_external_memory_host', 1, 'device->rad_info.has_userptr'),
diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c
index 499d94befeb..9c61e769ebd 100644
--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@@ -595,6 +595,14 @@ static bool radv_is_filter_minmax_format_supported(VkFormat format)
}
}
+bool
+radv_device_supports_etc(struct radv_physical_device *physical_device)
+{
+ return physical_device->rad_info.family == CHIP_VEGA10 ||
+ physical_device->rad_info.family == CHIP_RAVEN ||
+ physical_device->rad_info.family == CHIP_STONEY;
+}
+
static void
radv_physical_device_get_format_properties(struct radv_physical_device *physical_device,
VkFormat format,
@@ -612,9 +620,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical
}
if (desc->layout == VK_FORMAT_LAYOUT_ETC &&
- physical_device->rad_info.family != CHIP_VEGA10 &&
- physical_device->rad_info.family != CHIP_RAVEN &&
- physical_device->rad_info.family != CHIP_STONEY) {
+ !radv_device_supports_etc(physical_device)) {
out_properties->linearTilingFeatures = linear;
out_properties->optimalTilingFeatures = tiled;
out_properties->bufferFeatures = buffer;
@@ -984,10 +990,22 @@ bool radv_format_pack_clear_color(VkFormat format,
assert(channel->size == 8);
v = util_format_linear_float_to_srgb_8unorm(value->float32[c]);
- } else if (channel->type == VK_FORMAT_TYPE_UNSIGNED) {
- v = MAX2(MIN2(value->float32[c], 1.0f), 0.0f) * ((1ULL << channel->size) - 1);
- } else {
- v = MAX2(MIN2(value->float32[c], 1.0f), -1.0f) * ((1ULL << (channel->size - 1)) - 1);
+ } else {
+ float f = MIN2(value->float32[c], 1.0f);
+
+ if (channel->type == VK_FORMAT_TYPE_UNSIGNED) {
+ f = MAX2(f, 0.0f) * ((1ULL << channel->size) - 1);
+ } else {
+ f = MAX2(f, -1.0f) * ((1ULL << (channel->size - 1)) - 1);
+ }
+
+ /* The hardware rounds before conversion. */
+ if (f > 0)
+ f += 0.5f;
+ else
+ f -= 0.5f;
+
+ v = (uint64_t)f;
}
} else if (channel->type == VK_FORMAT_TYPE_FLOAT) {
if (channel->size == 32) {
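The restructured path makes the rounding explicit: scale to the integer range first, then add ±0.5 before truncating, matching the hardware's behaviour. An 8-bit UNORM check (hypothetical helper):

    def pack_unorm8(x):
        f = max(min(x, 1.0), 0.0) * 255
        f += 0.5 if f > 0 else -0.5   # round like the hardware, then truncate
        return int(f)

    assert pack_unorm8(0.5) == 128    # plain truncation gave 127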
diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c
index ef690edb471..f3a8f6464b8 100644
--- a/src/amd/vulkan/radv_meta_blit.c
+++ b/src/amd/vulkan/radv_meta_blit.c
@@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device,
.subpass = 0,
};
- switch(aspect) {
- case VK_IMAGE_ASPECT_COLOR_BIT:
- vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
- .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
- .attachmentCount = 1,
- .pAttachments = (VkPipelineColorBlendAttachmentState []) {
- { .colorWriteMask =
- VK_COLOR_COMPONENT_A_BIT |
- VK_COLOR_COMPONENT_R_BIT |
- VK_COLOR_COMPONENT_G_BIT |
- VK_COLOR_COMPONENT_B_BIT },
+ VkPipelineColorBlendStateCreateInfo color_blend_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = (VkPipelineColorBlendAttachmentState []) {
+ {
+ .colorWriteMask = VK_COLOR_COMPONENT_A_BIT |
+ VK_COLOR_COMPONENT_R_BIT |
+ VK_COLOR_COMPONENT_G_BIT |
+ VK_COLOR_COMPONENT_B_BIT },
}
};
+
+ VkPipelineDepthStencilStateCreateInfo depth_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+ .depthTestEnable = true,
+ .depthWriteEnable = true,
+ .depthCompareOp = VK_COMPARE_OP_ALWAYS,
+ };
+
+ VkPipelineDepthStencilStateCreateInfo stencil_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+ .depthTestEnable = false,
+ .depthWriteEnable = false,
+ .stencilTestEnable = true,
+ .front = {
+ .failOp = VK_STENCIL_OP_REPLACE,
+ .passOp = VK_STENCIL_OP_REPLACE,
+ .depthFailOp = VK_STENCIL_OP_REPLACE,
+ .compareOp = VK_COMPARE_OP_ALWAYS,
+ .compareMask = 0xff,
+ .writeMask = 0xff,
+ .reference = 0
+ },
+ .back = {
+ .failOp = VK_STENCIL_OP_REPLACE,
+ .passOp = VK_STENCIL_OP_REPLACE,
+ .depthFailOp = VK_STENCIL_OP_REPLACE,
+ .compareOp = VK_COMPARE_OP_ALWAYS,
+ .compareMask = 0xff,
+ .writeMask = 0xff,
+ .reference = 0
+ },
+ .depthCompareOp = VK_COMPARE_OP_ALWAYS,
+ };
+
+ switch(aspect) {
+ case VK_IMAGE_ASPECT_COLOR_BIT:
+ vk_pipeline_info.pColorBlendState = &color_blend_info;
break;
case VK_IMAGE_ASPECT_DEPTH_BIT:
- vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
- .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
- .depthTestEnable = true,
- .depthWriteEnable = true,
- .depthCompareOp = VK_COMPARE_OP_ALWAYS,
- };
+ vk_pipeline_info.pDepthStencilState = &depth_info;
break;
case VK_IMAGE_ASPECT_STENCIL_BIT:
- vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) {
- .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
- .depthTestEnable = false,
- .depthWriteEnable = false,
- .stencilTestEnable = true,
- .front = {
- .failOp = VK_STENCIL_OP_REPLACE,
- .passOp = VK_STENCIL_OP_REPLACE,
- .depthFailOp = VK_STENCIL_OP_REPLACE,
- .compareOp = VK_COMPARE_OP_ALWAYS,
- .compareMask = 0xff,
- .writeMask = 0xff,
- .reference = 0
- },
- .back = {
- .failOp = VK_STENCIL_OP_REPLACE,
- .passOp = VK_STENCIL_OP_REPLACE,
- .depthFailOp = VK_STENCIL_OP_REPLACE,
- .compareOp = VK_COMPARE_OP_ALWAYS,
- .compareMask = 0xff,
- .writeMask = 0xff,
- .reference = 0
- },
- .depthCompareOp = VK_COMPARE_OP_ALWAYS,
- };
+ vk_pipeline_info.pDepthStencilState = &stencil_info;
break;
default:
unreachable("Unhandled aspect");
diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index 8805f0435e1..051dcbc7c76 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -370,14 +370,29 @@ emit_color_clear(struct radv_cmd_buffer *cmd_buffer,
const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
const uint32_t subpass_att = clear_att->colorAttachment;
const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment;
- const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
- const uint32_t samples = iview->image->info.samples;
- const uint32_t samples_log2 = ffs(samples) - 1;
- unsigned fs_key = radv_format_meta_fs_key(iview->vk_format);
+ const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL;
+ uint32_t samples, samples_log2;
+ VkFormat format;
+ unsigned fs_key;
VkClearColorValue clear_value = clear_att->clearValue.color;
VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
VkPipeline pipeline;
+ /* When a framebuffer is bound to the current command buffer, get the
+ * number of samples from it. Otherwise, get the number of samples from
+ * the render pass because it's likely a secondary command buffer.
+ */
+ if (iview) {
+ samples = iview->image->info.samples;
+ format = iview->vk_format;
+ } else {
+ samples = cmd_buffer->state.pass->attachments[pass_att].samples;
+ format = cmd_buffer->state.pass->attachments[pass_att].format;
+ }
+
+ samples_log2 = ffs(samples) - 1;
+ fs_key = radv_format_meta_fs_key(format);
+
if (fs_key == -1) {
radv_finishme("color clears incomplete");
return;
@@ -617,6 +632,9 @@ static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer,
const VkClearRect *clear_rect,
VkClearDepthStencilValue clear_value)
{
+ if (!iview)
+ return false;
+
uint32_t queue_mask = radv_image_queue_family_mask(iview->image,
cmd_buffer->queue_family_index,
cmd_buffer->queue_family_index);
@@ -632,8 +650,9 @@ static bool depth_view_can_fast_clear(struct radv_cmd_buffer *cmd_buffer,
if (radv_image_has_htile(iview->image) &&
iview->base_mip == 0 &&
iview->base_layer == 0 &&
+ iview->layer_count == iview->image->info.array_size &&
radv_layout_is_htile_compressed(iview->image, layout, queue_mask) &&
- !radv_image_extent_compare(iview->image, &iview->extent))
+ radv_image_extent_compare(iview->image, &iview->extent))
return true;
return false;
}
@@ -705,11 +724,22 @@ emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer,
const uint32_t pass_att = subpass->depth_stencil_attachment.attachment;
VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil;
VkImageAspectFlags aspects = clear_att->aspectMask;
- const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
- const uint32_t samples = iview->image->info.samples;
- const uint32_t samples_log2 = ffs(samples) - 1;
+ const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL;
+ uint32_t samples, samples_log2;
VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
+ /* When a framebuffer is bound to the current command buffer, get the
+ * number of samples from it. Otherwise, get the number of samples from
+ * the render pass because it's likely a secondary command buffer.
+ */
+ if (iview) {
+ samples = iview->image->info.samples;
+ } else {
+ samples = cmd_buffer->state.pass->attachments[pass_att].samples;
+ }
+
+ samples_log2 = ffs(samples) - 1;
+
assert(pass_att != VK_ATTACHMENT_UNUSED);
if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
@@ -915,7 +945,11 @@ static bool
radv_image_view_can_fast_clear(struct radv_device *device,
const struct radv_image_view *iview)
{
- struct radv_image *image = iview->image;
+ struct radv_image *image;
+
+ if (!iview)
+ return false;
+ image = iview->image;
/* Only fast clear if the image itself can be fast cleared. */
if (!radv_image_can_fast_clear(device, image))
@@ -1523,7 +1557,7 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer,
const uint32_t subpass_att = clear_att->colorAttachment;
const uint32_t pass_att = subpass->color_attachments[subpass_att].attachment;
VkImageLayout image_layout = subpass->color_attachments[subpass_att].layout;
- const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
+ const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL;
VkClearColorValue clear_value = clear_att->clearValue.color;
if (radv_can_fast_clear_color(cmd_buffer, iview, image_layout,
@@ -1536,8 +1570,11 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer,
}
} else {
const uint32_t pass_att = subpass->depth_stencil_attachment.attachment;
+ if (pass_att == VK_ATTACHMENT_UNUSED)
+ return;
+
VkImageLayout image_layout = subpass->depth_stencil_attachment.layout;
- const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
+ const struct radv_image_view *iview = fb ? fb->attachments[pass_att].attachment : NULL;
VkClearDepthStencilValue clear_value = clear_att->clearValue.depthStencil;
assert(aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index e80938527e5..00d65de8164 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -92,6 +92,7 @@ struct radv_shader_context {
gl_shader_stage stage;
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
+ uint64_t float16_shaded_mask;
uint64_t input_mask;
uint64_t output_mask;
@@ -1441,7 +1442,7 @@ store_tcs_output(struct ac_shader_abi *abi,
{
struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
const unsigned location = var->data.location;
- const unsigned component = var->data.location_frac;
+ unsigned component = var->data.location_frac;
const bool is_patch = var->data.patch;
const bool is_compact = var->data.compact;
LLVMValueRef dw_addr;
@@ -1459,10 +1460,14 @@ store_tcs_output(struct ac_shader_abi *abi,
}
param = shader_io_get_unique_index(location);
- if (location == VARYING_SLOT_CLIP_DIST0 &&
- is_compact && const_index > 3) {
- const_index -= 3;
- param++;
+ if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
+ const_index += component;
+ component = 0;
+
+ if (const_index >= 4) {
+ const_index -= 4;
+ param++;
+ }
}
if (!is_patch) {
@@ -1529,9 +1534,13 @@ load_tes_input(struct ac_shader_abi *abi,
LLVMValueRef result;
unsigned param = shader_io_get_unique_index(location);
- if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) {
- const_index -= 3;
- param++;
+ if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) {
+ const_index += component;
+ component = 0;
+ if (const_index >= 4) {
+ const_index -= 4;
+ param++;
+ }
}
buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
@@ -2018,10 +2027,32 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
- input = ac_build_buffer_load_format(&ctx->ac, t_list,
- buffer_index,
- ctx->ac.i32_0,
- num_channels, false, true);
+ if (ctx->options->key.vs.vertex_attribute_provided & (1u << attrib_index)) {
+ input = ac_build_buffer_load_format(&ctx->ac, t_list,
+ buffer_index,
+ ctx->ac.i32_0,
+ num_channels, false, true);
+ } else {
+ /* Per the Vulkan spec, it's invalid to consume vertex
+ * attributes that are not provided by the pipeline, but
+ * some (non-conformant) apps appear to do so. Fill the
+ * input array with a default value (e.g. (0, 0, 0, 1)) to
+ * work around the problem and to avoid possible GPU hangs.
+ */
+ LLVMValueRef chan[4];
+
+ /* The input usage mask might be 0 if unused input variables
+ * were not eliminated by the compiler; clamp so we still read
+ * at least one channel and at most four.
+ */
+ num_channels = CLAMP(num_channels, 1, 4);
+
+ for (unsigned i = 0; i < num_channels; i++) {
+ chan[i] = i == 3 ? ctx->ac.f32_1 : ctx->ac.f32_0;
+ chan[i] = ac_to_float(&ctx->ac, chan[i]);
+ }
+
+ input = ac_build_gather_values(&ctx->ac, chan, num_channels);
+ }
input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels);
@@ -2051,6 +2082,7 @@ static void interp_fs_input(struct radv_shader_context *ctx,
unsigned attr,
LLVMValueRef interp_param,
LLVMValueRef prim_mask,
+ bool float16,
LLVMValueRef result[4])
{
LLVMValueRef attr_number;
@@ -2083,7 +2115,12 @@ static void interp_fs_input(struct radv_shader_context *ctx,
for (chan = 0; chan < 4; chan++) {
LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
- if (interp) {
+ if (interp && float16) {
+ result[chan] = ac_build_fs_interp_f16(&ctx->ac,
+ llvm_chan,
+ attr_number,
+ prim_mask, i, j);
+ } else if (interp) {
result[chan] = ac_build_fs_interp(&ctx->ac,
llvm_chan,
attr_number,
@@ -2095,7 +2132,30 @@ static void interp_fs_input(struct radv_shader_context *ctx,
attr_number,
prim_mask);
result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, "");
- result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), "");
+ result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? ctx->ac.i16 : ctx->ac.i32, "");
+ }
+ }
+}
+
+static void mark_16bit_fs_input(struct radv_shader_context *ctx,
+ const struct glsl_type *type,
+ int location)
+{
+ if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) {
+ unsigned attrib_count = glsl_count_attribute_slots(type, false);
+ if (glsl_type_is_16bit(type)) {
+ ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location;
+ }
+ } else if (glsl_type_is_array(type)) {
+ unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false);
+ for (unsigned i = 0; i < glsl_get_length(type); ++i) {
+ mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride);
+ }
+ } else {
+ assert(glsl_type_is_struct(type));
+ for (unsigned i = 0; i < glsl_get_length(type); i++) {
+ mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location);
+ location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false);
}
}
}
@@ -2110,9 +2170,20 @@ handle_fs_input_decl(struct radv_shader_context *ctx,
uint64_t mask;
variable->data.driver_location = idx * 4;
+
+
+ if (variable->data.compact) {
+ unsigned component_count = variable->data.location_frac +
+ glsl_get_length(variable->type);
+ attrib_count = (component_count + 3) / 4;
+ } else
+ mark_16bit_fs_input(ctx, variable->type, idx);
+
mask = ((1ull << attrib_count) - 1) << variable->data.location;
- if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) {
+ if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT ||
+ glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT16 ||
+ glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_STRUCT) {
unsigned interp_type;
if (variable->data.sample)
interp_type = INTERP_SAMPLE;
@@ -2123,22 +2194,12 @@ handle_fs_input_decl(struct radv_shader_context *ctx,
interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type);
}
- bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
- LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32;
if (interp == NULL)
- interp = LLVMGetUndef(type);
+ interp = LLVMGetUndef(ctx->ac.i32);
for (unsigned i = 0; i < attrib_count; ++i)
ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp;
- if (idx == VARYING_SLOT_CLIP_DIST0) {
- /* Do not account for the number of components inside the array
- * of clip/cull distances because this might wrongly set other
- * bits like primitive ID or layer.
- */
- mask = 1ull << VARYING_SLOT_CLIP_DIST0;
- }
-
ctx->input_mask |= mask;
}
@@ -2200,11 +2261,14 @@ handle_fs_inputs(struct radv_shader_context *ctx,
if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
interp_param = *inputs;
- interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
+ bool float16 = (ctx->float16_shaded_mask >> i) & 1;
+ interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16,
inputs);
if (LLVMIsUndef(interp_param))
ctx->shader_info->fs.flat_shaded_mask |= 1u << index;
+ if (float16)
+ ctx->shader_info->fs.float16_shaded_mask |= 1u << index;
if (i >= VARYING_SLOT_VAR0)
ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index;
++index;
@@ -2216,7 +2280,7 @@ handle_fs_inputs(struct radv_shader_context *ctx,
interp_param = *inputs;
interp_fs_input(ctx, index, interp_param,
- ctx->abi.prim_mask, inputs);
+ ctx->abi.prim_mask, false, inputs);
++index;
}
} else if (i == VARYING_SLOT_POS) {
@@ -2250,6 +2314,12 @@ scan_shader_output_decl(struct radv_shader_context *ctx,
if (stage == MESA_SHADER_TESS_CTRL)
return;
+ if (variable->data.compact) {
+ unsigned component_count = variable->data.location_frac +
+ glsl_get_length(variable->type);
+ attrib_count = (component_count + 3) / 4;
+ }
+
mask_attribs = ((1ull << attrib_count) - 1) << idx;
if (stage == MESA_SHADER_VERTEX ||
stage == MESA_SHADER_TESS_EVAL ||
@@ -2265,8 +2335,6 @@ scan_shader_output_decl(struct radv_shader_context *ctx,
ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size;
}
-
- mask_attribs = 1ull << idx;
}
}
@@ -2365,7 +2433,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (is_16bit) {
for (unsigned chan = 0; chan < 4; chan++)
values[chan] = LLVMBuildZExt(ctx->ac.builder,
- values[chan],
+ ac_to_integer(&ctx->ac, values[chan]),
ctx->ac.i32, "");
}
break;
@@ -2376,7 +2444,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (is_16bit) {
for (unsigned chan = 0; chan < 4; chan++)
values[chan] = LLVMBuildSExt(ctx->ac.builder,
- values[chan],
+ ac_to_integer(&ctx->ac, values[chan]),
ctx->ac.i32, "");
}
break;
@@ -2429,12 +2497,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
} else
memcpy(&args->out[0], values, sizeof(values[0]) * 4);
- for (unsigned i = 0; i < 4; ++i) {
- if (!(args->enabled_channels & (1 << i)))
- continue;
-
+ for (unsigned i = 0; i < 4; ++i)
args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
- }
}
static void
@@ -2615,51 +2679,41 @@ handle_vs_outputs_post(struct radv_shader_context *ctx,
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(outinfo->vs_output_param_offset));
- if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) {
- unsigned output_usage_mask, length;
- LLVMValueRef slots[8];
- unsigned j;
-
- if (ctx->stage == MESA_SHADER_VERTEX &&
- !ctx->is_gs_copy_shader) {
- output_usage_mask =
- ctx->shader_info->info.vs.output_usage_mask[VARYING_SLOT_CLIP_DIST0];
- } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
- output_usage_mask =
- ctx->shader_info->info.tes.output_usage_mask[VARYING_SLOT_CLIP_DIST0];
- } else {
- assert(ctx->is_gs_copy_shader);
- output_usage_mask =
- ctx->shader_info->info.gs.output_usage_mask[VARYING_SLOT_CLIP_DIST0];
- }
+ for(unsigned location = VARYING_SLOT_CLIP_DIST0; location <= VARYING_SLOT_CLIP_DIST1; ++location) {
+ if (ctx->output_mask & (1ull << location)) {
+ unsigned output_usage_mask, length;
+ LLVMValueRef slots[4];
+ unsigned j;
+
+ if (ctx->stage == MESA_SHADER_VERTEX &&
+ !ctx->is_gs_copy_shader) {
+ output_usage_mask =
+ ctx->shader_info->info.vs.output_usage_mask[location];
+ } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
+ output_usage_mask =
+ ctx->shader_info->info.tes.output_usage_mask[location];
+ } else {
+ assert(ctx->is_gs_copy_shader);
+ output_usage_mask =
+ ctx->shader_info->info.gs.output_usage_mask[location];
+ }
- length = util_last_bit(output_usage_mask);
+ length = util_last_bit(output_usage_mask);
- i = VARYING_SLOT_CLIP_DIST0;
- for (j = 0; j < length; j++)
- slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
+ for (j = 0; j < length; j++)
+ slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, location, j));
- for (i = length; i < 8; i++)
- slots[i] = LLVMGetUndef(ctx->ac.f32);
+ for (i = length; i < 4; i++)
+ slots[i] = LLVMGetUndef(ctx->ac.f32);
- if (length > 4) {
- target = V_008DFC_SQ_EXP_POS + 3;
- si_llvm_init_export_args(ctx, &slots[4], 0xf, target, &args);
+ target = V_008DFC_SQ_EXP_POS + 2 + (location - VARYING_SLOT_CLIP_DIST0);
+ si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args);
memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
- &args, sizeof(args));
- }
+ &args, sizeof(args));
- target = V_008DFC_SQ_EXP_POS + 2;
- si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args);
- memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
- &args, sizeof(args));
-
- /* Export the clip/cull distances values to the next stage. */
- radv_export_param(ctx, param_count, &slots[0], 0xf);
- outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0] = param_count++;
- if (length > 4) {
- radv_export_param(ctx, param_count, &slots[4], 0xf);
- outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1] = param_count++;
+ /* Export the clip/cull distances values to the next stage. */
+ radv_export_param(ctx, param_count, &slots[0], 0xf);
+ outinfo->vs_output_param_offset[location] = param_count++;
}
}
@@ -2820,28 +2874,14 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
LLVMValueRef lds_base = NULL;
for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
- unsigned output_usage_mask;
int param_index;
- int length = 4;
if (!(ctx->output_mask & (1ull << i)))
continue;
- if (ctx->stage == MESA_SHADER_VERTEX) {
- output_usage_mask =
- ctx->shader_info->info.vs.output_usage_mask[i];
- } else {
- assert(ctx->stage == MESA_SHADER_TESS_EVAL);
- output_usage_mask =
- ctx->shader_info->info.tes.output_usage_mask[i];
- }
-
- if (i == VARYING_SLOT_CLIP_DIST0)
- length = util_last_bit(output_usage_mask);
-
param_index = shader_io_get_unique_index(i);
- max_output_written = MAX2(param_index + (length > 4), max_output_written);
+ max_output_written = MAX2(param_index, max_output_written);
}
outinfo->esgs_itemsize = (max_output_written + 1) * 16;
@@ -2862,7 +2902,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
unsigned output_usage_mask;
int param_index;
- int length = 4;
if (!(ctx->output_mask & (1ull << i)))
continue;
@@ -2876,9 +2915,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
ctx->shader_info->info.tes.output_usage_mask[i];
}
- if (i == VARYING_SLOT_CLIP_DIST0)
- length = util_last_bit(output_usage_mask);
-
param_index = shader_io_get_unique_index(i);
if (lds_base) {
@@ -2887,7 +2923,7 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
"");
}
- for (j = 0; j < length; j++) {
+ for (j = 0; j < 4; j++) {
if (!(output_usage_mask & (1 << j)))
continue;
@@ -2924,22 +2960,16 @@ handle_ls_outputs_post(struct radv_shader_context *ctx)
vertex_dw_stride, "");
for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
- unsigned output_usage_mask =
- ctx->shader_info->info.vs.output_usage_mask[i];
LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
- int length = 4;
if (!(ctx->output_mask & (1ull << i)))
continue;
- if (i == VARYING_SLOT_CLIP_DIST0)
- length = util_last_bit(output_usage_mask);
-
int param = shader_io_get_unique_index(i);
LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
LLVMConstInt(ctx->ac.i32, param * 4, false),
"");
- for (unsigned j = 0; j < length; j++) {
+ for (unsigned j = 0; j < 4; j++) {
LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
value = ac_to_integer(&ctx->ac, value);
value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
@@ -3467,10 +3497,17 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
ctx.abi.clamp_shadow_reference = false;
ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800;
+ /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8,
+ * we fall back to the old intrinsics for atomic buffer image operations
+ * and thus we need to apply the indexing workaround.
+ */
+ ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
+
if (shader_count >= 2)
ac_init_exec_full_mask(&ctx.ac);
- if (ctx.ac.chip_class == GFX9 &&
+ if ((ctx.ac.family == CHIP_VEGA10 ||
+ ctx.ac.family == CHIP_RAVEN) &&
shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
ac_nir_fixup_ls_hs_input_vgprs(&ctx);
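To make the new compact clip/cull remapping concrete, here is a worked trace of the logic added to store_tcs_output() and load_tes_input(), using hypothetical values:

    /* gl_ClipDistance[5] arrives as const_index = 5, component = 0,
     * param = 2 (the CLIP_DIST0 slot). */
    const_index += component;   /* still 5 */
    component = 0;
    if (const_index >= 4) {
            const_index -= 4;   /* 1: second element of the second vec4 */
            param++;            /* 3: the CLIP_DIST1 slot */
    }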
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 138e153f9a4..2526000f56f 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -524,6 +524,14 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
col_format |= cf << (4 * i);
}
+ if (!col_format && blend->need_src_alpha & (1 << 0)) {
+ /* When a subpass doesn't have any color attachments, write the
+ * alpha channel of MRT0 when alpha coverage is enabled because
+ * the depth attachment needs it.
+ */
+ col_format |= V_028714_SPI_SHADER_32_ABGR;
+ }
+
/* If the i-th target format is set, all previous target formats must
* be non-zero to avoid hangs.
*/
@@ -689,6 +697,7 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
if (vkms && vkms->alphaToCoverageEnable) {
blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
+ blend.need_src_alpha |= 0x1;
}
blend.cb_target_mask = 0;
@@ -1436,11 +1445,13 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
- if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
+ if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount;
- typed_memcpy(dynamic->discard_rectangle.rectangles,
- discard_rectangle_info->pDiscardRectangles,
- discard_rectangle_info->discardRectangleCount);
+ if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
+ typed_memcpy(dynamic->discard_rectangle.rectangles,
+ discard_rectangle_info->pDiscardRectangles,
+ discard_rectangle_info->discardRectangleCount);
+ }
}
pipeline->dynamic_state.mask = states;
@@ -1913,6 +1924,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
}
key.vertex_alpha_adjust |= adjust << (2 * location);
}
+
+ key.vertex_attribute_provided |= 1 << location;
}
if (pCreateInfo->pTessellationState)
@@ -1941,6 +1954,7 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
{
keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs;
keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust;
+ keys[MESA_SHADER_VERTEX].vs.vertex_attribute_provided = key->vertex_attribute_provided;
for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i)
keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i];
@@ -3079,13 +3093,17 @@ radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs,
radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
}
-static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade)
+static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, bool float16)
{
uint32_t ps_input_cntl;
if (offset <= AC_EXP_PARAM_OFFSET_31) {
ps_input_cntl = S_028644_OFFSET(offset);
if (flat_shade)
ps_input_cntl |= S_028644_FLAT_SHADE(1);
+ if (float16) {
+ ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
+ S_028644_ATTR0_VALID(1);
+ }
} else {
/* The input is a DEFAULT_VAL constant. */
assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
@@ -3110,7 +3128,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
if (ps->info.info.ps.prim_id_input) {
unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
- ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true);
+ ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false);
++ps_offset;
}
}
@@ -3120,9 +3138,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
ps->info.info.needs_multiview_view_index) {
unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
if (vs_offset != AC_EXP_PARAM_UNDEFINED)
- ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true);
+ ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false);
else
- ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true);
+ ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false);
++ps_offset;
}
@@ -3138,14 +3156,14 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0];
if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
- ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false);
+ ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false);
++ps_offset;
}
vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1];
if (vs_offset != AC_EXP_PARAM_UNDEFINED &&
ps->info.info.ps.num_input_clips_culls > 4) {
- ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false);
+ ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false);
++ps_offset;
}
}
@@ -3153,6 +3171,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) {
unsigned vs_offset;
bool flat_shade;
+ bool float16;
if (!(ps->info.fs.input_mask & (1u << i)))
continue;
@@ -3164,8 +3183,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
}
flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset));
+ float16 = !!(ps->info.fs.float16_shaded_mask & (1u << ps_offset));
- ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade);
+ ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, float16);
++ps_offset;
}
@@ -3192,11 +3212,11 @@ radv_compute_db_shader_control(const struct radv_device *device,
bool disable_rbplus = device->physical_device->has_rbplus &&
!device->physical_device->rbplus_allowed;
- /* Do not enable the gl_SampleMask fragment shader output if MSAA is
- * disabled.
+ /* Exporting gl_SampleMask shouldn't be needed when MSAA is disabled,
+ * but skipping the export appears to break Project Cars (DXVK). See
+ * https://bugs.freedesktop.org/show_bug.cgi?id=109401
*/
- bool mask_export_enable = ms->num_samples > 1 &&
- ps->info.info.ps.writes_sample_mask;
+ bool mask_export_enable = ps->info.info.ps.writes_sample_mask;
return S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) |
S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) |
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 85c18906f84..ea957ae6dab 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -306,6 +306,9 @@ struct radv_physical_device {
/* Whether DCC should be enabled for MSAA textures. */
bool dcc_msaa_allowed;
+ /* Whether LOAD_CONTEXT_REG packets are supported. */
+ bool has_load_ctx_reg_pkt;
+
/* This is the drivers on-disk cache used as a fallback as opposed to
* the pipeline cache defined by apps.
*/
@@ -362,6 +365,7 @@ struct radv_pipeline_cache {
struct radv_pipeline_key {
uint32_t instance_rate_inputs;
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
+ uint32_t vertex_attribute_provided;
uint64_t vertex_alpha_adjust;
unsigned tess_input_vertices;
uint32_t col_format;
@@ -1144,6 +1148,7 @@ void si_write_scissors(struct radeon_cmdbuf *cs, int first,
const VkViewport *viewports, bool can_use_guardband);
uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
bool instanced_draw, bool indirect_draw,
+ bool count_from_stream_output,
uint32_t draw_vertex_count);
void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
enum chip_class chip_class,
@@ -1462,6 +1467,7 @@ bool radv_format_pack_clear_color(VkFormat format,
bool radv_is_colorbuffer_format_supported(VkFormat format, bool *blendable);
bool radv_dcc_formats_compatible(VkFormat format1,
VkFormat format2);
+bool radv_device_supports_etc(struct radv_physical_device *physical_device);
struct radv_fmask_info {
uint64_t offset;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 32cd9ae25e9..ec571e2f8c5 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
NIR_PASS(progress, shader, nir_opt_if);
NIR_PASS(progress, shader, nir_opt_dead_cf);
NIR_PASS(progress, shader, nir_opt_cse);
- NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true);
NIR_PASS(progress, shader, nir_opt_algebraic);
NIR_PASS(progress, shader, nir_opt_constant_folding);
NIR_PASS(progress, shader, nir_opt_undef);
@@ -222,6 +222,8 @@ radv_shader_compile_to_nir(struct radv_device *device,
.lower_ubo_ssbo_access_to_offsets = true,
.caps = {
.descriptor_array_dynamic_indexing = true,
+ .descriptor_array_non_uniform_indexing = true,
+ .descriptor_indexing = true,
.device_group = true,
.draw_parameters = true,
.float64 = true,
@@ -610,6 +612,8 @@ shader_variant_create(struct radv_device *device,
tm_options |= AC_TM_SISCHED;
if (options->check_ir)
tm_options |= AC_TM_CHECK_IR;
+ if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT)
+ tm_options |= AC_TM_NO_LOAD_STORE_OPT;
thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);
radv_init_llvm_once();
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 3652a811e80..f6f9dd2bbf1 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -66,6 +66,9 @@ struct radv_vs_variant_key {
uint32_t instance_rate_inputs;
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
+ /* Mask of vertex attributes that are provided by the pipeline. */
+ uint32_t vertex_attribute_provided;
+
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
* so we may need to fix it up. */
uint64_t alpha_adjust;
@@ -257,6 +260,7 @@ struct radv_shader_variant_info {
unsigned num_interp;
uint32_t input_mask;
uint32_t flat_shaded_mask;
+ uint32_t float16_shaded_mask;
bool can_discard;
bool early_fragment_test;
} fs;
@@ -401,6 +405,8 @@ static inline unsigned shader_io_get_unique_index(gl_varying_slot slot)
return 1;
if (slot == VARYING_SLOT_CLIP_DIST0)
return 2;
+ if (slot == VARYING_SLOT_CLIP_DIST1)
+ return 3;
/* 3 is reserved for clip dist as well */
if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
return 4 + (slot - VARYING_SLOT_VAR0);
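Illustrative sanity checks for the new slot mapping (the asserts are mine, not part of the patch): the compact clip/cull distance array now spans two consecutive unique slots, before the generic varyings.

    assert(shader_io_get_unique_index(VARYING_SLOT_CLIP_DIST0) == 2);
    assert(shader_io_get_unique_index(VARYING_SLOT_CLIP_DIST1) == 3);
    assert(shader_io_get_unique_index(VARYING_SLOT_VAR0) == 4);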
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index 7e5a3789af2..fdc4f52086b 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -101,7 +101,7 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
case MESA_SHADER_VERTEX: {
nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
- if (var->data.mode == nir_var_shader_in) {
+ if (var && var->data.mode == nir_var_shader_in) {
unsigned idx = var->data.location;
uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa);
@@ -115,6 +115,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
}
}
+static uint32_t
+widen_writemask(uint32_t wrmask)
+{
+ uint32_t new_wrmask = 0;
+ for(unsigned i = 0; i < 4; i++)
+ new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2);
+ return new_wrmask;
+}
+
static void
set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
uint8_t *output_usage_mask)
@@ -122,25 +131,27 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
nir_deref_instr *deref_instr =
nir_instr_as_deref(instr->src[0].ssa->parent_instr);
nir_variable *var = nir_deref_instr_get_variable(deref_instr);
- unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
+ unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, false);
unsigned idx = var->data.location;
unsigned comp = var->data.location_frac;
unsigned const_offset = 0;
get_deref_offset(deref_instr, &const_offset);
- if (idx == VARYING_SLOT_CLIP_DIST0) {
- /* Special case for clip/cull distances because there are
- * combined into a single array that contains both.
- */
- output_usage_mask[idx] |= 1 << const_offset;
+ if (var->data.compact) {
+ assert(!glsl_type_is_64bit(deref_instr->type));
+ const_offset += comp;
+ output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset % 4);
return;
}
- for (unsigned i = 0; i < attrib_count; i++) {
+ uint32_t wrmask = nir_intrinsic_write_mask(instr);
+ if (glsl_type_is_64bit(deref_instr->type))
+ wrmask = widen_writemask(wrmask);
+
+ for (unsigned i = 0; i < attrib_count; i++)
output_usage_mask[idx + i + const_offset] |=
- instr->const_index[0] << comp;
- }
+ ((wrmask >> (i * 4)) & 0xf) << comp;
}
static void
@@ -150,7 +161,7 @@ gather_intrinsic_store_deref_info(const nir_shader *nir,
{
nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
- if (var->data.mode == nir_var_shader_out) {
+ if (var && var->data.mode == nir_var_shader_out) {
unsigned idx = var->data.location;
switch (nir->info.stage) {
@@ -174,13 +185,9 @@ gather_intrinsic_store_deref_info(const nir_shader *nir,
type = glsl_get_array_element(var->type);
unsigned slots =
- var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
+ var->data.compact ? DIV_ROUND_UP(var->data.location_frac + glsl_get_length(type), 4)
: glsl_count_attribute_slots(type, false);
- if (idx == VARYING_SLOT_CLIP_DIST0)
- slots = (nir->info.clip_distance_array_size +
- nir->info.cull_distance_array_size > 4) ? 2 : 1;
-
mark_tess_output(info, var->data.patch, param, slots);
break;
}
@@ -374,7 +381,8 @@ gather_info_input_decl_ps(const nir_shader *nir, const nir_variable *var,
info->ps.layer_input = true;
break;
case VARYING_SLOT_CLIP_DIST0:
- info->ps.num_input_clips_culls = attrib_count;
+ case VARYING_SLOT_CLIP_DIST1:
+ info->ps.num_input_clips_culls += attrib_count;
break;
default:
break;
@@ -409,8 +417,8 @@ gather_info_output_decl_ls(const nir_shader *nir, const nir_variable *var,
int idx = var->data.location;
unsigned param = shader_io_get_unique_index(idx);
int num_slots = glsl_count_attribute_slots(var->type, false);
- if (idx == VARYING_SLOT_CLIP_DIST0)
- num_slots = (nir->info.clip_distance_array_size + nir->info.cull_distance_array_size > 4) ? 2 : 1;
+ if (var->data.compact)
+ num_slots = DIV_ROUND_UP(var->data.location_frac + glsl_get_length(var->type), 4);
mark_ls_output(info, param, num_slots);
}
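A worked example for widen_writemask() above: a 64-bit component occupies two 32-bit slots, so each writemask bit doubles. These checks are illustrative only:

    assert(widen_writemask(0x5) == 0x33); /* x,z written -> xy,zw pairs */
    assert(widen_writemask(0xf) == 0xff); /* full vec4 -> full dvec4 */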
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index e75c6d127d6..e73c13762e5 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -561,6 +561,7 @@ radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num)
uint32_t
si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
bool instanced_draw, bool indirect_draw,
+ bool count_from_stream_output,
uint32_t draw_vertex_count)
{
enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
@@ -622,6 +623,12 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
(instanced_draw || indirect_draw))
partial_vs_wave = true;
+ /* Hardware requirement when drawing primitives from a stream
+ * output buffer.
+ */
+ if (count_from_stream_output)
+ wd_switch_on_eop = true;
+
/* If the WD switch is false, the IA switch must be false too. */
assert(wd_switch_on_eop || !ia_switch_on_eop);
}
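Callers of si_get_ia_multi_vgt_param() must pass the new flag; a hypothetical call site (variable names assumed) would look like:

    /* The new argument tells the helper whether the vertex count
     * comes from a streamout buffer, which forces wd_switch_on_eop. */
    ia_multi_vgt_param =
            si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
                                      indirect_draw,
                                      count_from_stream_output,
                                      draw_vertex_count);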
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index d3b1e2cd4c6..49a86a72c31 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -543,7 +543,7 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
cs->handles[cs->num_buffers].bo_handle = bo;
cs->handles[cs->num_buffers].bo_priority = priority;
- hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
+ hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
cs->buffer_hash_table[hash] = cs->num_buffers;
++cs->num_buffers;
@@ -665,6 +665,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
assert(num < ws->num_buffers);
handles[num].bo_handle = bo->bo_handle;
handles[num].bo_priority = bo->priority;
+ num++;
}
r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers,
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h
index 854e216551f..709669b2a57 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h
@@ -29,6 +29,13 @@
#ifndef RADV_AMDGPU_WINSYS_PUBLIC_H
#define RADV_AMDGPU_WINSYS_PUBLIC_H
+/* The number of IBs per submit isn't infinite; it depends on the ring type
+ * (i.e. some initial setup is needed per submit) and each IB costs 4 DW.
+ * This limit is arbitrary but should be safe for now. Ideally, we should get
+ * this limit from the KMD.
+ */
+#define RADV_MAX_IBS_PER_SUBMIT 192
+
struct radeon_winsys *radv_amdgpu_winsys_create(int fd, uint64_t debug_flags,
uint64_t perftest_flags);
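A sketch of how such a cap is typically consumed when flushing (helper name hypothetical; DIV_ROUND_UP is Mesa's util macro):

    /* Split a large IB list into several kernel submissions. */
    static unsigned num_submits_needed(unsigned ib_count)
    {
            return DIV_ROUND_UP(ib_count, RADV_MAX_IBS_PER_SUBMIT);
    }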
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
index 754461dc067..06e8ddad7ec 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -820,8 +820,8 @@
-
-
+
+
diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h
index e10b4586609..cb1ee7c96f4 100644
--- a/src/broadcom/common/v3d_cpu_tiling.h
+++ b/src/broadcom/common/v3d_cpu_tiling.h
@@ -159,9 +159,8 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
* d0-d7.
*/
"vstm %[gpu], {q0, q1, q2, q3}\n"
- :
+ : [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
- [cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index ee7a3e6bc00..e21ee246eff 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -32,7 +32,8 @@
*/
#define V3D_MAX_TEXTURE_SAMPLERS 16
-#define V3D_MAX_MIP_LEVELS 12
+/* The HW can do 16384 (15 levels), but we run into hangs when we expose that. */
+#define V3D_MAX_MIP_LEVELS 13
#define V3D_MAX_SAMPLES 4
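The arithmetic behind the new limit, as an illustrative check (STATIC_ASSERT is Mesa's util macro):

    /* 13 mip levels cover a 4096 (2^12) base size; the 16384 (2^14) the
     * HW nominally supports would need 15 levels. */
    STATIC_ASSERT((1 << (V3D_MAX_MIP_LEVELS - 1)) == 4096);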
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index a5e75f650e8..bd19bb9b0b6 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -121,7 +121,7 @@ vir_emit_thrsw(struct v3d_compile *c)
*/
c->last_thrsw = vir_NOP(c);
c->last_thrsw->qpu.sig.thrsw = true;
- c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
+ c->last_thrsw_at_top_level = !c->in_control_flow;
}
static uint32_t
@@ -1158,7 +1158,9 @@ emit_frag_end(struct v3d_compile *c)
inst->src[vir_get_implicit_uniform_src(inst)] =
vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ c->writes_z = true;
} else if (c->s->info.fs.uses_discard ||
+ !c->s->info.fs.early_fragment_tests ||
c->fs_key->sample_alpha_to_coverage ||
!has_any_tlb_color_write) {
/* Emit passthrough Z if it needed to be delayed until shader
@@ -1188,6 +1190,7 @@ emit_frag_end(struct v3d_compile *c)
inst->src[vir_get_implicit_uniform_src(inst)] =
vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ c->writes_z = true;
}
/* XXX: Performance improvement: Merge Z write and color writes TLB
@@ -1455,7 +1458,7 @@ v3d_optimize_nir(struct nir_shader *s)
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_opt_cse);
- NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
NIR_PASS(progress, s, nir_opt_undef);
@@ -2103,10 +2106,10 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
else
else_block = vir_new_block(c);
- bool was_top_level = false;
+ bool was_uniform_control_flow = false;
if (c->execute.file == QFILE_NULL) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
- was_top_level = true;
+ was_uniform_control_flow = true;
}
/* Set up the flags for the IF condition (taking the THEN branch). */
@@ -2122,7 +2125,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
* was previously active (execute Z) for updating the exec flags.
*/
- if (was_top_level) {
+ if (was_uniform_control_flow) {
cond = v3d_qpu_cond_invert(cond);
} else {
struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
@@ -2176,7 +2179,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
vir_link_blocks(c->cur_block, after_block);
vir_set_emit_block(c, after_block);
- if (was_top_level)
+ if (was_uniform_control_flow)
c->execute = c->undef;
else
ntq_activate_execute_for_block(c);
@@ -2185,12 +2188,15 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
static void
ntq_emit_if(struct v3d_compile *c, nir_if *nif)
{
+ bool was_in_control_flow = c->in_control_flow;
+ c->in_control_flow = true;
if (c->execute.file == QFILE_NULL &&
nir_src_is_dynamically_uniform(nif->condition)) {
ntq_emit_uniform_if(c, nif);
} else {
ntq_emit_nonuniform_if(c, nif);
}
+ c->in_control_flow = was_in_control_flow;
}
static void
@@ -2267,10 +2273,13 @@ static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
- bool was_top_level = false;
+ bool was_in_control_flow = c->in_control_flow;
+ c->in_control_flow = true;
+
+ bool was_uniform_control_flow = false;
if (c->execute.file == QFILE_NULL) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
- was_top_level = true;
+ was_uniform_control_flow = true;
}
struct qblock *save_loop_cont_block = c->loop_cont_block;
@@ -2307,7 +2316,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
vir_link_blocks(c->cur_block, c->loop_break_block);
vir_set_emit_block(c, c->loop_break_block);
- if (was_top_level)
+ if (was_uniform_control_flow)
c->execute = c->undef;
else
ntq_activate_execute_for_block(c);
@@ -2316,6 +2325,8 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
c->loop_cont_block = save_loop_cont_block;
c->loops++;
+
+ c->in_control_flow = was_in_control_flow;
}
static void
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 127b04136d1..671aba3c551 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -519,6 +519,7 @@ struct v3d_compile {
uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
bool uses_center_w;
+ bool writes_z;
struct v3d_ubo_range *ubo_ranges;
bool *ubo_range_used;
@@ -531,6 +532,7 @@ struct v3d_compile {
* yes, otherwise a block number + 1 that the channel jumped to.
*/
struct qreg execute;
+ bool in_control_flow;
struct qreg line_x, point_x, point_y;
@@ -716,7 +718,7 @@ struct v3d_fs_prog_data {
uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
bool writes_z;
- bool discard;
+ bool disable_ez;
bool uses_center_w;
};
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index e74206b3949..2aa3cbad495 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -156,7 +156,7 @@ pack_sint(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
int num_components)
{
color = nir_channels(b, color, (1 << num_components) - 1);
- color = nir_format_clamp_uint(b, color, bits);
+ color = nir_format_clamp_sint(b, color, bits);
return pack_bits(b, color, bits, num_components, true);
}
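Why the one-character fix matters, shown in an illustrative 8-bit scalar form (helper hypothetical):

    #include <stdint.h>

    /* Signed image stores must clamp to the signed range: -1 stays -1.
     * The old unsigned clamp to [0, 255] turned every negative into 0. */
    static int32_t clamp_sint8(int32_t v)
    {
            if (v < -128) return -128;
            if (v > 127)  return 127;
            return v;
    }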
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 10105fbd861..20f7004149c 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -777,21 +777,9 @@ v3d_fs_set_prog_data(struct v3d_compile *c,
struct v3d_fs_prog_data *prog_data)
{
v3d_set_fs_prog_data_inputs(c, prog_data);
- prog_data->writes_z = (c->s->info.outputs_written &
- (1 << FRAG_RESULT_DEPTH));
- prog_data->discard = (c->s->info.fs.uses_discard ||
- c->fs_key->sample_alpha_to_coverage);
+ prog_data->writes_z = c->writes_z;
+ prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
prog_data->uses_center_w = c->uses_center_w;
-
- /* If the shader has some side effects and hasn't allowed early
- * fragment tests, disable them.
- */
- if (!c->s->info.fs.early_fragment_tests &&
- (c->s->info.num_images ||
- c->s->info.num_ssbos ||
- c->s->info.num_abos)) {
- prog_data->discard = true;
- }
}
static void
@@ -888,6 +876,15 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
{
if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
v3d_fixup_fs_output_types(c);
+
+ /* If the shader has no non-TLB side effects, we can enable
+ * early_fragment_tests even if the user didn't request it. */
+ */
+ if (!(c->s->info.num_images ||
+ c->s->info.num_ssbos ||
+ c->s->info.num_abos)) {
+ c->s->info.fs.early_fragment_tests = true;
+ }
}
static void
diff --git a/src/compiler/Android.glsl.gen.mk b/src/compiler/Android.glsl.gen.mk
index e31eb6f101f..3b94ea7bd2f 100644
--- a/src/compiler/Android.glsl.gen.mk
+++ b/src/compiler/Android.glsl.gen.mk
@@ -104,6 +104,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e
@mkdir -p $(dir $@)
$(hide) $(MESA_PYTHON2) $< strings > $@
-$(intermediates)/compiler/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py
+$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py
@mkdir -p $(dir $@)
$(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@
diff --git a/src/compiler/Android.glsl.mk b/src/compiler/Android.glsl.mk
index 0aabafa2673..37b3cb80251 100644
--- a/src/compiler/Android.glsl.mk
+++ b/src/compiler/Android.glsl.mk
@@ -48,7 +48,7 @@ LOCAL_STATIC_LIBRARIES := \
libmesa_nir
LOCAL_MODULE := libmesa_glsl
-
+LOCAL_CFLAGS += -Wno-error
include $(LOCAL_PATH)/Android.glsl.gen.mk
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk
index 75a247a245d..59da5dbdc1c 100644
--- a/src/compiler/Android.nir.mk
+++ b/src/compiler/Android.nir.mk
@@ -41,6 +41,9 @@ LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/gallium/include \
$(MESA_TOP)/src/gallium/auxiliary
+LOCAL_CFLAGS := \
+ -Wno-missing-braces
+
LOCAL_STATIC_LIBRARIES := libmesa_compiler
LOCAL_MODULE := libmesa_nir
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 489022a22a1..0b40c3c6ebe 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -229,6 +229,7 @@ NIR_FILES = \
nir/nir_lower_alpha_test.c \
nir/nir_lower_alu.c \
nir/nir_lower_alu_to_scalar.c \
+ nir/nir_lower_array_deref_of_vec.c \
nir/nir_lower_atomics_to_ssbo.c \
nir/nir_lower_bitmap.c \
nir/nir_lower_bit_size.c \
@@ -251,6 +252,7 @@ NIR_FILES = \
nir/nir_lower_io_arrays_to_elements.c \
nir/nir_lower_io_to_temporaries.c \
nir/nir_lower_io_to_scalar.c \
+ nir/nir_lower_io_to_vector.c \
nir/nir_lower_packing.c \
nir/nir_lower_passthrough_edgeflags.c \
nir/nir_lower_patch_vertices.c \
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 620153e6a34..8c707265e44 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -3698,6 +3698,10 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
"cannot be applied to a matrix, a structure, "
"a block, or an array containing any of "
"these.");
+ } else if (components > 4 && type->is_64bit()) {
+ _mesa_glsl_error(loc, state, "component layout qualifier "
+ "cannot be applied to dvec%u.",
+ components / 2);
} else if (qual_component != 0 &&
(qual_component + components - 1) > 3) {
_mesa_glsl_error(loc, state, "component overflow (%u > 3)",
@@ -3940,7 +3944,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
"`invariant' after being used",
var->name);
} else {
- var->data.invariant = 1;
+ var->data.explicit_invariant = true;
+ var->data.invariant = true;
}
}
@@ -4148,8 +4153,10 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
}
}
- if (state->all_invariant && var->data.mode == ir_var_shader_out)
+ if (state->all_invariant && var->data.mode == ir_var_shader_out) {
+ var->data.explicit_invariant = true;
var->data.invariant = true;
+ }
var->data.interpolation =
interpret_interpolation_qualifier(qual, var->type,
@@ -4857,6 +4864,7 @@ ast_declarator_list::hir(exec_list *instructions,
"`invariant' after being used",
earlier->name);
} else {
+ earlier->data.explicit_invariant = true;
earlier->data.invariant = true;
}
}
diff --git a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c
index 719968a6671..87718112db7 100644
--- a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c
+++ b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c
@@ -147,10 +147,20 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state,
remove_struct_derefs_prep(path.path, &name, &location, &type);
- assert(location < state->shader_program->data->NumUniformStorage &&
- state->shader_program->data->UniformStorage[location].opaque[stage].active);
+ if (state->shader_program && var->data.how_declared != nir_var_hidden) {
+ /* For GLSL programs, look up the bindings in the uniform storage. */
+ assert(location < state->shader_program->data->NumUniformStorage &&
+ state->shader_program->data->UniformStorage[location].opaque[stage].active);
- binding = state->shader_program->data->UniformStorage[location].opaque[stage].index;
+ binding = state->shader_program->data->UniformStorage[location].opaque[stage].index;
+ } else {
+ /* For ARB programs, built-in shaders, or internally generated sampler
+ * variables in GLSL programs, assume that whoever created the shader
+ * already set the bindings correctly.
+ */
+ assert(var->data.explicit_binding);
+ binding = var->data.binding;
+ }
if (var->type == type) {
/* Fast path: We did not encounter any struct derefs. */
@@ -167,6 +177,14 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state,
} else {
var = nir_variable_create(state->shader, nir_var_uniform, type, name);
var->data.binding = binding;
+
+ /* Don't set var->data.location. The old structure location could be
+ * used to index into gl_uniform_storage, assuming the full structure
+ * was walked in order. With the new split variables, this invariant
+ * no longer holds and there's no meaningful way to start from a base
+ * location and access a particular array element. Just leave it 0.
+ */
+
_mesa_hash_table_insert_pre_hashed(state->remap_table, hash, name, var);
}
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 1c095cb66f9..c951d9526ac 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -224,10 +224,12 @@ expanded_line:
glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro);
_glcpp_parser_skip_stack_change_if (parser, & @1, "elif", $2.value);
}
-| LINE_EXPANDED integer_constant NEWLINE {
+| LINE_EXPANDED expression NEWLINE {
+ if (parser->is_gles && $2.undefined_macro)
+ glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro);
parser->has_new_line_number = 1;
- parser->new_line_number = $2;
- _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2);
+ parser->new_line_number = $2.value;
+ _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2.value);
}
| LINE_EXPANDED integer_constant integer_constant NEWLINE {
parser->has_new_line_number = 1;
@@ -238,6 +240,17 @@ expanded_line:
"#line %" PRIiMAX " %" PRIiMAX "\n",
$2, $3);
}
+| LINE_EXPANDED '(' expression ')' '(' expression ')' NEWLINE {
+ if (parser->is_gles && $3.undefined_macro)
+ glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $3.undefined_macro);
+ if (parser->is_gles && $6.undefined_macro)
+ glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $6.undefined_macro);
+ parser->has_new_line_number = 1;
+ parser->new_line_number = $3.value;
+ parser->has_new_source_number = 1;
+ parser->new_source_number = $6.value;
+ _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX " %" PRIiMAX "\n", $3.value, $6.value);
+ }
;
define:
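The practical effect of the grammar change: #line now accepts full preprocessor expressions, not just integer literals. An illustrative input follows (my example; the parenthesized expansion is what the new two-expression rule matches):

    #define LINE_NUM (100 + 2)
    #define SRC_STR  (3)
    #line LINE_NUM SRC_STR   /* expands to "#line (100 + 2) (3)":
                              * line 102, source string 3 */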
diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp
index d2db0f95aca..47fc2fea160 100644
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -353,6 +353,12 @@ nir_visitor::visit(ir_variable *ir)
ir->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)) {
var->data.compact = ir->type->without_array()->is_scalar();
}
+
+ if (shader->info.stage > MESA_SHADER_VERTEX &&
+ ir->data.location >= VARYING_SLOT_CLIP_DIST0 &&
+ ir->data.location <= VARYING_SLOT_CULL_DIST1) {
+ var->data.compact = ir->type->without_array()->is_scalar();
+ }
}
break;
@@ -363,6 +369,12 @@ nir_visitor::visit(ir_variable *ir)
ir->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)) {
var->data.compact = ir->type->without_array()->is_scalar();
}
+
+ if (shader->info.stage <= MESA_SHADER_GEOMETRY &&
+ ir->data.location >= VARYING_SLOT_CLIP_DIST0 &&
+ ir->data.location <= VARYING_SLOT_CULL_DIST1) {
+ var->data.compact = ir->type->without_array()->is_scalar();
+ }
break;
case ir_var_uniform:
diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp
index 1d1a56ae9a5..f5aa1be4e20 100644
--- a/src/compiler/glsl/ir.cpp
+++ b/src/compiler/glsl/ir.cpp
@@ -1734,6 +1734,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
this->data.centroid = false;
this->data.sample = false;
this->data.patch = false;
+ this->data.explicit_invariant = false;
this->data.invariant = false;
this->data.how_declared = ir_var_declared_normally;
this->data.mode = mode;
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index d05d1998a50..8b32ed8209a 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -233,7 +233,7 @@ class ir_rvalue : public ir_instruction {
ir_rvalue *as_rvalue_to_saturate();
- virtual bool is_lvalue(const struct _mesa_glsl_parse_state *state = NULL) const
+ virtual bool is_lvalue(const struct _mesa_glsl_parse_state * = NULL) const
{
return false;
}
@@ -657,6 +657,19 @@ class ir_variable : public ir_instruction {
unsigned centroid:1;
unsigned sample:1;
unsigned patch:1;
+ /**
+ * Was an 'invariant' qualifier explicitly set in the shader?
+ *
+ * This is used to cross-validate qualifiers between shader stages.
+ */
+ unsigned explicit_invariant:1;
+ /**
+ * Is the variable invariant?
+ *
+ * This can happen either because the 'invariant' qualifier was
+ * explicitly set in the shader or because the variable is used in
+ * the calculation of other invariant variables.
+ */
unsigned invariant:1;
unsigned precise:1;
diff --git a/src/compiler/glsl/ir_print_visitor.cpp b/src/compiler/glsl/ir_print_visitor.cpp
index ef6bca1229e..b055d25d60d 100644
--- a/src/compiler/glsl/ir_print_visitor.cpp
+++ b/src/compiler/glsl/ir_print_visitor.cpp
@@ -199,6 +199,7 @@ void ir_print_visitor::visit(ir_variable *ir)
const char *const samp = (ir->data.sample) ? "sample " : "";
const char *const patc = (ir->data.patch) ? "patch " : "";
const char *const inv = (ir->data.invariant) ? "invariant " : "";
+ const char *const explicit_inv = (ir->data.explicit_invariant) ? "explicit_invariant " : "";
const char *const prec = (ir->data.precise) ? "precise " : "";
const char *const bindless = (ir->data.bindless) ? "bindless " : "";
const char *const bound = (ir->data.bound) ? "bound " : "";
@@ -215,11 +216,11 @@ void ir_print_visitor::visit(ir_variable *ir)
const char *const interp[] = { "", "smooth", "flat", "noperspective" };
STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_MODE_COUNT);
- fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ",
+ fprintf(f, "(%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s) ",
binding, loc, component, cent, bindless, bound,
image_format, memory_read_only, memory_write_only,
memory_coherent, memory_volatile, memory_restrict,
- samp, patc, inv, prec, mode[ir->data.mode],
+ samp, patc, inv, explicit_inv, prec, mode[ir->data.mode],
stream,
interp[ir->data.interpolation]);
diff --git a/src/compiler/glsl/ir_reader.cpp b/src/compiler/glsl/ir_reader.cpp
index b87933ba511..d4f0e58b155 100644
--- a/src/compiler/glsl/ir_reader.cpp
+++ b/src/compiler/glsl/ir_reader.cpp
@@ -419,8 +419,10 @@ ir_reader::read_declaration(s_expression *expr)
var->data.sample = 1;
} else if (strcmp(qualifier->value(), "patch") == 0) {
var->data.patch = 1;
+ } else if (strcmp(qualifier->value(), "explicit_invariant") == 0) {
+ var->data.explicit_invariant = true;
} else if (strcmp(qualifier->value(), "invariant") == 0) {
- var->data.invariant = 1;
+ var->data.invariant = true;
} else if (strcmp(qualifier->value(), "uniform") == 0) {
var->data.mode = ir_var_uniform;
} else if (strcmp(qualifier->value(), "shader_storage") == 0) {
diff --git a/src/compiler/glsl/link_uniform_block_active_visitor.cpp b/src/compiler/glsl/link_uniform_block_active_visitor.cpp
index 368981852c0..0af3b312071 100644
--- a/src/compiler/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/compiler/glsl/link_uniform_block_active_visitor.cpp
@@ -103,6 +103,7 @@ process_arrays(void *mem_ctx, ir_dereference_array *ir,
if (*ub_array_ptr == NULL) {
*ub_array_ptr = rzalloc(mem_ctx, struct uniform_block_array_elements);
(*ub_array_ptr)->ir = ir;
+ (*ub_array_ptr)->original_dim_size = block->type->length;
}
struct uniform_block_array_elements *ub_array = *ub_array_ptr;
diff --git a/src/compiler/glsl/link_uniform_block_active_visitor.h b/src/compiler/glsl/link_uniform_block_active_visitor.h
index fbac65d5b67..a8ea3f52b6d 100644
--- a/src/compiler/glsl/link_uniform_block_active_visitor.h
+++ b/src/compiler/glsl/link_uniform_block_active_visitor.h
@@ -32,6 +32,7 @@ struct uniform_block_array_elements {
unsigned num_array_elements;
ir_dereference_array *ir;
+ unsigned original_dim_size;
struct uniform_block_array_elements *array;
};
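A worked example (hypothetical shader) of what original_dim_size is for, ahead of the link_uniform_blocks.cpp changes below:

    /* layout(binding = 2) uniform B { ... } b[2][3];
     *
     * The GLSL rules assign consecutive bindings across all instances,
     * so b[i][j] must get binding 2 + i * 3 + j; b[1][2] is binding 7
     * even when b[0][*] is never referenced. Stepping the outer index
     * by the recorded inner dimension gives that, whereas the old code
     * bumped a shared counter once per visited leaf and miscounted
     * sparse access patterns. */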
diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp
index 0b890586298..1665fc3f8cb 100644
--- a/src/compiler/glsl/link_uniform_blocks.cpp
+++ b/src/compiler/glsl/link_uniform_blocks.cpp
@@ -244,18 +244,21 @@ process_block_array(struct uniform_block_array_elements *ub_array, char **name,
for (unsigned j = 0; j < ub_array->num_array_elements; j++) {
size_t new_length = name_length;
+ unsigned int element_idx = ub_array->array_elements[j];
/* Append the subscript to the current variable name */
- ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]",
- ub_array->array_elements[j]);
+ ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", element_idx);
if (ub_array->array) {
+ unsigned boffset = (*binding_offset) + (element_idx *
+ ub_array->original_dim_size);
process_block_array(ub_array->array, name, new_length, blocks,
parcel, variables, b, block_index,
- binding_offset, ctx, prog, first_index);
+ &boffset, ctx, prog, first_index);
} else {
+ unsigned boffset = (*binding_offset) + element_idx;
process_block_array_leaf(*name, blocks,
parcel, variables, b, block_index,
- binding_offset, *block_index - first_index,
+ &boffset, *block_index - first_index,
ctx, prog);
}
}
@@ -307,7 +310,6 @@ process_block_array_leaf(const char *name,
(unsigned)(ptrdiff_t)(&variables[parcel->index] - blocks[i].Uniforms);
*block_index = *block_index + 1;
- *binding_offset = *binding_offset + 1;
}
/* This function resizes the array types of the block so that later we can use
@@ -440,6 +442,7 @@ link_uniform_blocks(void *mem_ctx,
GLSL_INTERFACE_PACKING_PACKED)) {
b->type = resize_block_array(b->type, b->array);
b->var->type = b->type;
+ b->var->data.max_array_access = b->type->length - 1;
}
block_size.num_active_uniforms = 0;
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 63e688b19a7..13fc603ce7a 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -62,6 +62,15 @@ program_resource_visitor::process(const glsl_type *type, const char *name,
void
program_resource_visitor::process(ir_variable *var, bool use_std430_as_default)
+{
+ const glsl_type *t =
+ var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+ process(var, t, use_std430_as_default);
+}
+
+void
+program_resource_visitor::process(ir_variable *var, const glsl_type *var_type,
+ bool use_std430_as_default)
{
unsigned record_array_count = 1;
const bool row_major =
@@ -72,8 +81,7 @@ program_resource_visitor::process(ir_variable *var, bool use_std430_as_default)
get_internal_ifc_packing(use_std430_as_default) :
var->type->get_internal_ifc_packing(use_std430_as_default);
- const glsl_type *t =
- var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+ const glsl_type *t = var_type;
const glsl_type *t_without_array = t->without_array();
/* false is always passed for the row_major parameter to the other
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 3969c0120b3..28187e2f0a4 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -309,16 +309,16 @@ cross_validate_types_and_qualifiers(struct gl_context *ctx,
* "The invariance of varyings that are declared in both the vertex
* and fragment shaders must match."
*/
- if (input->data.invariant != output->data.invariant &&
+ if (input->data.explicit_invariant != output->data.explicit_invariant &&
prog->data->Version < (prog->IsES ? 300 : 430)) {
linker_error(prog,
"%s shader output `%s' %s invariant qualifier, "
"but %s shader input %s invariant qualifier\n",
_mesa_shader_stage_to_string(producer_stage),
output->name,
- (output->data.invariant) ? "has" : "lacks",
+ (output->data.explicit_invariant) ? "has" : "lacks",
_mesa_shader_stage_to_string(consumer_stage),
- (input->data.invariant) ? "has" : "lacks");
+ (input->data.explicit_invariant) ? "has" : "lacks");
return;
}
@@ -424,28 +424,14 @@ compute_variable_location_slot(ir_variable *var, gl_shader_stage stage)
struct explicit_location_info {
ir_variable *var;
- unsigned numerical_type;
+ bool base_type_is_integer;
+ unsigned base_type_bit_size;
unsigned interpolation;
bool centroid;
bool sample;
bool patch;
};
-static inline unsigned
-get_numerical_type(const glsl_type *type)
-{
- /* From the OpenGL 4.6 spec, section 4.4.1 Input Layout Qualifiers, Page 68,
- * (Location aliasing):
- *
- * "Further, when location aliasing, the aliases sharing the location
- * must have the same underlying numerical type (floating-point or
- * integer)
- */
- if (type->is_float() || type->is_double())
- return GLSL_TYPE_FLOAT;
- return GLSL_TYPE_INT;
-}
-
static bool
check_location_aliasing(struct explicit_location_info explicit_locations[][4],
ir_variable *var,
@@ -461,14 +447,23 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
gl_shader_stage stage)
{
unsigned last_comp;
- if (type->without_array()->is_record()) {
- /* The component qualifier can't be used on structs so just treat
- * all component slots as used.
+ unsigned base_type_bit_size;
+ const glsl_type *type_without_array = type->without_array();
+ const bool base_type_is_integer =
+ glsl_base_type_is_integer(type_without_array->base_type);
+ const bool is_struct = type_without_array->is_record();
+ if (is_struct) {
+ /* Structs don't have a defined underlying base type, so just treat all
+ * component slots as used and set the bit size to 0. If there is
+ * location aliasing, we'll fail later anyway.
*/
last_comp = 4;
+ base_type_bit_size = 0;
} else {
- unsigned dmul = type->without_array()->is_64bit() ? 2 : 1;
- last_comp = component + type->without_array()->vector_elements * dmul;
+ unsigned dmul = type_without_array->is_64bit() ? 2 : 1;
+ last_comp = component + type_without_array->vector_elements * dmul;
+ base_type_bit_size =
+ glsl_base_type_get_bit_size(type_without_array->base_type);
}
while (location < location_limit) {
@@ -478,8 +473,22 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
&explicit_locations[location][comp];
if (info->var) {
- /* Component aliasing is not alloed */
- if (comp >= component && comp < last_comp) {
+ if (info->var->type->without_array()->is_record() || is_struct) {
+ /* Structs cannot share location since they are incompatible
+ * with any other underlying numerical type.
+ */
+ linker_error(prog,
+ "%s shader has multiple %sputs sharing the "
+ "same location that don't have the same "
+ "underlying numerical type. Struct variable '%s', "
+ "location %u\n",
+ _mesa_shader_stage_to_string(stage),
+ var->data.mode == ir_var_shader_in ? "in" : "out",
+ is_struct ? var->name : info->var->name,
+ location);
+ return false;
+ } else if (comp >= component && comp < last_comp) {
+ /* Component aliasing is not allowed */
linker_error(prog,
"%s shader has multiple %sputs explicitly "
"assigned to location %d and component %d\n",
@@ -488,27 +497,52 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
location, comp);
return false;
} else {
- /* For all other used components we need to have matching
- * types, interpolation and auxiliary storage
+ /* From the OpenGL 4.60.5 spec, section 4.4.1 Input Layout
+ * Qualifiers, Page 67, (Location aliasing):
+ *
+ * " Further, when location aliasing, the aliases sharing the
+ * location must have the same underlying numerical type
+ * and bit width (floating-point or integer, 32-bit versus
+ * 64-bit, etc.) and the same auxiliary storage and
+ * interpolation qualification."
*/
- if (info->numerical_type !=
- get_numerical_type(type->without_array())) {
+
+ /* If the underlying numerical type isn't integer, it is implicitly
+ * float, or else we would have failed by now.
+ */
+ if (info->base_type_is_integer != base_type_is_integer) {
linker_error(prog,
- "Varyings sharing the same location must "
- "have the same underlying numerical type. "
- "Location %u component %u\n",
- location, comp);
+ "%s shader has multiple %sputs sharing the "
+ "same location that don't have the same "
+ "underlying numerical type. Location %u "
+ "component %u.\n",
+ _mesa_shader_stage_to_string(stage),
+ var->data.mode == ir_var_shader_in ?
+ "in" : "out", location, comp);
+ return false;
+ }
+
+ if (info->base_type_bit_size != base_type_bit_size) {
+ linker_error(prog,
+ "%s shader has multiple %sputs sharing the "
+ "same location that don't have the same "
+ "underlying numerical bit size. Location %u "
+ "component %u.\n",
+ _mesa_shader_stage_to_string(stage),
+ var->data.mode == ir_var_shader_in ?
+ "in" : "out", location, comp);
return false;
}
if (info->interpolation != interpolation) {
linker_error(prog,
- "%s shader has multiple %sputs at explicit "
- "location %u with different interpolation "
- "settings\n",
+ "%s shader has multiple %sputs sharing the "
+ "same location that don't have the same "
+ "interpolation qualification. Location %u "
+ "component %u.\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ?
- "in" : "out", location);
+ "in" : "out", location, comp);
return false;
}
@@ -516,17 +550,20 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
info->sample != sample ||
info->patch != patch) {
linker_error(prog,
- "%s shader has multiple %sputs at explicit "
- "location %u with different aux storage\n",
+ "%s shader has multiple %sputs sharing the "
+ "same location that don't have the same "
+ "auxiliary storage qualification. Location %u "
+ "component %u.\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ?
- "in" : "out", location);
+ "in" : "out", location, comp);
return false;
}
}
} else if (comp >= component && comp < last_comp) {
info->var = var;
- info->numerical_type = get_numerical_type(type->without_array());
+ info->base_type_is_integer = base_type_is_integer;
+ info->base_type_bit_size = base_type_bit_size;
info->interpolation = interpolation;
info->centroid = centroid;
info->sample = sample;
@@ -773,8 +810,20 @@ cross_validate_outputs_to_inputs(struct gl_context *ctx,
output = explicit_locations[idx][input->data.location_frac].var;
- if (output == NULL ||
- input->data.location != output->data.location) {
+ if (output == NULL) {
+ /* A linker failure should only happen when there is no
+ * output declaration and there is Static Use of the
+ * declared input.
+ */
+ if (input->data.used) {
+ linker_error(prog,
+ "%s shader input `%s' with explicit location "
+ "has no matching output\n",
+ _mesa_shader_stage_to_string(consumer->Stage),
+ input->name);
+ break;
+ }
+ } else if (input->data.location != output->data.location) {
linker_error(prog,
"%s shader input `%s' with explicit location "
"has no matching output\n",
@@ -804,7 +853,7 @@ cross_validate_outputs_to_inputs(struct gl_context *ctx,
*/
assert(!input->data.assigned);
if (input->data.used && !input->get_interface_type() &&
- !input->data.explicit_location && !prog->SeparateShader)
+ !input->data.explicit_location)
linker_error(prog,
"%s shader input `%s' "
"has no matching output in the previous stage\n",
@@ -1166,8 +1215,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
return false;
}
- if ((this->offset / 4) / info->Buffers[buffer].Stride !=
- (xfb_offset - 1) / info->Buffers[buffer].Stride) {
+ if (xfb_offset > info->Buffers[buffer].Stride) {
linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for "
"buffer (%d)", xfb_offset * 4,
info->Buffers[buffer].Stride * 4, buffer);
@@ -2124,9 +2172,11 @@ class tfeedback_candidate_generator : public program_resource_visitor
{
public:
tfeedback_candidate_generator(void *mem_ctx,
- hash_table *tfeedback_candidates)
+ hash_table *tfeedback_candidates,
+ gl_shader_stage stage)
: mem_ctx(mem_ctx),
tfeedback_candidates(tfeedback_candidates),
+ stage(stage),
toplevel_var(NULL),
varying_floats(0)
{
@@ -2136,10 +2186,17 @@ class tfeedback_candidate_generator : public program_resource_visitor
{
/* All named varying interface blocks should be flattened by now */
assert(!var->is_interface_instance());
+ assert(var->data.mode == ir_var_shader_out);
this->toplevel_var = var;
this->varying_floats = 0;
- program_resource_visitor::process(var, false);
+ const glsl_type *t =
+ var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+ if (!var->data.patch && stage == MESA_SHADER_TESS_CTRL) {
+ assert(t->is_array());
+ t = t->fields.array;
+ }
+ program_resource_visitor::process(var, t, false);
}
private:
@@ -2173,6 +2230,8 @@ class tfeedback_candidate_generator : public program_resource_visitor
*/
hash_table * const tfeedback_candidates;
+ gl_shader_stage stage;
+
/**
* Pointer to the toplevel variable that is being traversed.
*/
@@ -2503,8 +2562,28 @@ assign_varying_locations(struct gl_context *ctx,
producer->Stage == MESA_SHADER_GEOMETRY));
if (num_tfeedback_decls > 0) {
- tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates);
- g.process(output_var);
+ tfeedback_candidate_generator g(mem_ctx, tfeedback_candidates, producer->Stage);
+ /* From OpenGL 4.6 (Core Profile) spec, section 11.1.2.1
+ * ("Vertex Shader Variables / Output Variables")
+ *
+ * "Each program object can specify a set of output variables from
+ * one shader to be recorded in transform feedback mode (see
+ * section 13.3). The variables that can be recorded are those
+ * emitted by the first active shader, in order, from the
+ * following list:
+ *
+ * * geometry shader
+ * * tessellation evaluation shader
+ * * tessellation control shader
+ * * vertex shader"
+ *
+ * But on OpenGL ES 3.2, section 11.1.2.1 ("Vertex Shader
+ * Variables / Output Variables") tessellation control shader is
+ * not included in the stages list.
+ */
+ if (!prog->IsES || producer->Stage != MESA_SHADER_TESS_CTRL) {
+ g.process(output_var);
+ }
}
ir_variable *const input_var =
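The rewritten check_location_aliasing above replaces the coarse float-vs-integer classification with two properties: whether the base type is an integer, and its bit width, with structs given a bit size of 0 so they can never alias. A minimal standalone C sketch of that compatibility rule (hypothetical types and names, not the Mesa enums):

#include <stdbool.h>
#include <stdio.h>

struct base_type_info {
   bool is_integer;
   unsigned bit_size;   /* 0 marks a struct, which never aliases */
};

static bool
may_share_location(struct base_type_info a, struct base_type_info b)
{
   if (a.bit_size == 0 || b.bit_size == 0)
      return false;   /* structs are incompatible with everything */
   return a.is_integer == b.is_integer && a.bit_size == b.bit_size;
}

int main(void)
{
   struct base_type_info f32 = { false, 32 };  /* float  */
   struct base_type_info i32 = { true,  32 };  /* int    */
   struct base_type_info f64 = { false, 64 };  /* double */

   printf("float/float:  %d\n", may_share_location(f32, f32)); /* 1 */
   printf("float/int:    %d\n", may_share_location(f32, i32)); /* 0 */
   printf("float/double: %d\n", may_share_location(f32, f64)); /* 0 */
   return 0;
}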
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 2d76e852f47..0d9b1befdd5 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -1090,7 +1090,7 @@ cross_validate_globals(struct gl_context *ctx, struct gl_shader_program *prog,
}
}
- if (existing->data.invariant != var->data.invariant) {
+ if (existing->data.explicit_invariant != var->data.explicit_invariant) {
linker_error(prog, "declarations for %s `%s' have "
"mismatching invariant qualifiers\n",
mode_string(var), var->name);
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index f6fb00351d4..be92dbf983c 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -134,6 +134,26 @@ class program_resource_visitor {
*/
void process(ir_variable *var, bool use_std430_as_default);
+ /**
+ * Begin processing a variable
+ *
+ * Classes that overload this function should call \c ::process from the
+ * base class to start the recursive processing of the variable.
+ *
+ * \param var The variable that is to be processed
+ * \param var_type The glsl_type reference of the variable
+ *
+ * Calls \c ::visit_field for each leaf of the variable.
+ *
+ * \warning
+ * When processing a uniform block, this entry should only be used in cases
+ * where the row / column ordering of matrices in the block does not
+ * matter. For example, enumerating the names of members of the block, but
+ * not for determining the offsets of members.
+ */
+ void process(ir_variable *var, const glsl_type *var_type,
+ bool use_std430_as_default);
+
/**
* Begin processing a variable of a structured type.
*
diff --git a/src/compiler/glsl/list.h b/src/compiler/glsl/list.h
index 59ed766f2e1..979f6fcc539 100644
--- a/src/compiler/glsl/list.h
+++ b/src/compiler/glsl/list.h
@@ -81,6 +81,12 @@ struct exec_node {
* Insert a node in the list after the current node
*/
void insert_after(exec_node *after);
+
+ /**
+ * Insert the contents of another list after the current node
+ */
+ void insert_after(struct exec_list *after);
+
/**
* Insert a node in the list before the current node
*/
@@ -507,6 +513,21 @@ exec_list_append(struct exec_list *list, struct exec_list *source)
exec_list_make_empty(source);
}
+static inline void
+exec_node_insert_list_after(struct exec_node *n, struct exec_list *after)
+{
+ if (exec_list_is_empty(after))
+ return;
+
+ after->tail_sentinel.prev->next = n->next;
+ after->head_sentinel.next->prev = n;
+
+ n->next->prev = after->tail_sentinel.prev;
+ n->next = after->head_sentinel.next;
+
+ exec_list_make_empty(after);
+}
+
static inline void
exec_list_prepend(struct exec_list *list, struct exec_list *source)
{
@@ -635,6 +656,11 @@ inline void exec_list::append_list(exec_list *source)
exec_list_append(this, source);
}
+inline void exec_node::insert_after(exec_list *after)
+{
+ exec_node_insert_list_after(this, after);
+}
+
inline void exec_list::prepend_list(exec_list *source)
{
exec_list_prepend(this, source);
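The exec_node_insert_list_after helper added above splices an entire list into place after a node by rewiring exactly four live links, so the operation is O(1) regardless of list length. A standalone C sketch of the same rewiring on a simplified sentinel list (hypothetical types, not Mesa's exec_list API):

#include <stdio.h>

struct node { struct node *prev, *next; int val; };
struct list { struct node head, tail; };

static void list_init(struct list *l)
{
   l->head.prev = NULL; l->head.next = &l->tail;
   l->tail.prev = &l->head; l->tail.next = NULL;
}

static int list_is_empty(const struct list *l)
{
   return l->head.next == &l->tail;
}

static void push_tail(struct list *l, struct node *n)
{
   n->prev = l->tail.prev; n->next = &l->tail;
   l->tail.prev->next = n; l->tail.prev = n;
}

/* The four-link rewiring from exec_node_insert_list_after: splice all of
 * `src` immediately after node `n`, leaving `src` empty. */
static void insert_list_after(struct node *n, struct list *src)
{
   if (list_is_empty(src))
      return;

   src->tail.prev->next = n->next;
   src->head.next->prev = n;
   n->next->prev = src->tail.prev;
   n->next = src->head.next;

   list_init(src);   /* make the source empty again */
}

int main(void)
{
   struct list a, b;
   struct node n1 = {0, 0, 1}, n2 = {0, 0, 2}, n3 = {0, 0, 3}, n4 = {0, 0, 4};

   list_init(&a); list_init(&b);
   push_tail(&a, &n1); push_tail(&a, &n4);
   push_tail(&b, &n2); push_tail(&b, &n3);

   insert_list_after(&n1, &b);   /* a is now 1, 2, 3, 4 */

   for (struct node *n = a.head.next; n != &a.tail; n = n->next)
      printf("%d\n", n->val);
   return 0;
}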
diff --git a/src/compiler/glsl/lower_vector_derefs.cpp b/src/compiler/glsl/lower_vector_derefs.cpp
index 6cd9a2d819a..2aae30d8201 100644
--- a/src/compiler/glsl/lower_vector_derefs.cpp
+++ b/src/compiler/glsl/lower_vector_derefs.cpp
@@ -32,8 +32,9 @@ namespace {
class vector_deref_visitor : public ir_rvalue_enter_visitor {
public:
- vector_deref_visitor()
- : progress(false)
+ vector_deref_visitor(void *mem_ctx, gl_shader_stage shader_stage)
+ : progress(false), shader_stage(shader_stage),
+ factory(&factory_instructions, mem_ctx)
{
}
@@ -45,6 +46,9 @@ class vector_deref_visitor : public ir_rvalue_enter_visitor {
virtual ir_visitor_status visit_enter(ir_assignment *ir);
bool progress;
+ gl_shader_stage shader_stage;
+ exec_list factory_instructions;
+ ir_factory factory;
};
} /* anonymous namespace */
@@ -65,13 +69,63 @@ vector_deref_visitor::visit_enter(ir_assignment *ir)
ir_constant *old_index_constant =
deref->array_index->constant_expression_value(mem_ctx);
if (!old_index_constant) {
- ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert,
- new_lhs->type,
- new_lhs->clone(mem_ctx, NULL),
- ir->rhs,
- deref->array_index);
- ir->write_mask = (1 << new_lhs->type->vector_elements) - 1;
- ir->set_lhs(new_lhs);
+ if (shader_stage == MESA_SHADER_TESS_CTRL &&
+ deref->variable_referenced()->data.mode == ir_var_shader_out) {
+ /* Tessellation control shader outputs act as if they have memory
+ * backing them and if we have writes from multiple threads
+ * targeting the same vec4 (this can happen for patch outputs), the
+ * load-vec-store pattern of ir_triop_vector_insert doesn't work.
+ * Instead, we have to lower to a series of conditional write-masked
+ * assignments.
+ */
+ ir_variable *const src_temp =
+ factory.make_temp(ir->rhs->type, "scalar_tmp");
+
+ /* The newly created variable declaration goes before the assignment
+ * because we're going to set it as the new LHS.
+ */
+ ir->insert_before(factory.instructions);
+ ir->set_lhs(new(mem_ctx) ir_dereference_variable(src_temp));
+
+ ir_variable *const arr_index =
+ factory.make_temp(deref->array_index->type, "index_tmp");
+ factory.emit(assign(arr_index, deref->array_index));
+
+ for (unsigned i = 0; i < new_lhs->type->vector_elements; i++) {
+ ir_constant *const cmp_index =
+ ir_constant::zero(factory.mem_ctx, deref->array_index->type);
+ cmp_index->value.u[0] = i;
+
+ ir_rvalue *const lhs_clone = new_lhs->clone(factory.mem_ctx, NULL);
+ ir_dereference_variable *const src_temp_deref =
+ new(mem_ctx) ir_dereference_variable(src_temp);
+
+ if (new_lhs->ir_type != ir_type_swizzle) {
+ assert(lhs_clone->as_dereference());
+ ir_assignment *cond_assign =
+ new(mem_ctx) ir_assignment(lhs_clone->as_dereference(),
+ src_temp_deref,
+ equal(arr_index, cmp_index),
+ WRITEMASK_X << i);
+ factory.emit(cond_assign);
+ } else {
+ ir_assignment *cond_assign =
+ new(mem_ctx) ir_assignment(swizzle(lhs_clone, i, 1),
+ src_temp_deref,
+ equal(arr_index, cmp_index));
+ factory.emit(cond_assign);
+ }
+ }
+ ir->insert_after(factory.instructions);
+ } else {
+ ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert,
+ new_lhs->type,
+ new_lhs->clone(mem_ctx, NULL),
+ ir->rhs,
+ deref->array_index);
+ ir->write_mask = (1 << new_lhs->type->vector_elements) - 1;
+ ir->set_lhs(new_lhs);
+ }
} else if (new_lhs->ir_type != ir_type_swizzle) {
ir->set_lhs(new_lhs);
ir->write_mask = 1 << old_index_constant->get_uint_component(0);
@@ -105,7 +159,7 @@ vector_deref_visitor::handle_rvalue(ir_rvalue **rv)
bool
lower_vector_derefs(gl_linked_shader *shader)
{
- vector_deref_visitor v;
+ vector_deref_visitor v(shader->ir, shader->Stage);
visit_list_elements(&v, shader->ir);
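The TCS branch added above avoids the load/insert/store pattern because tessellation control outputs behave like shared memory: reading the whole vec4 and writing it all back can clobber components concurrently written by other invocations. A standalone C sketch of the two store strategies, with plain arrays standing in for the IR:

#include <stdio.h>

#define VEC_COMPONENTS 4

/* Old pattern: read the whole vector, insert one element, write the whole
 * vector back. Unsafe when other invocations own some of the components. */
static void
store_via_vector_insert(float vec[VEC_COMPONENTS], unsigned index, float value)
{
   float tmp[VEC_COMPONENTS];
   for (unsigned i = 0; i < VEC_COMPONENTS; i++)
      tmp[i] = vec[i];                 /* load */
   tmp[index] = value;                 /* vector_insert */
   for (unsigned i = 0; i < VEC_COMPONENTS; i++)
      vec[i] = tmp[i];                 /* store */
}

/* New pattern: one conditional, write-masked assignment per component, so
 * only the addressed component is ever written. */
static void
store_via_masked_assigns(float vec[VEC_COMPONENTS], unsigned index, float value)
{
   for (unsigned i = 0; i < VEC_COMPONENTS; i++) {
      if (index == i)
         vec[i] = value;
   }
}

int main(void)
{
   float a[VEC_COMPONENTS] = { 0 }, b[VEC_COMPONENTS] = { 0 };
   store_via_vector_insert(a, 2, 5.0f);
   store_via_masked_assigns(b, 2, 5.0f);
   printf("%g %g\n", a[2], b[2]);   /* 5 5 */
   return 0;
}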
diff --git a/src/compiler/glsl/serialize.cpp b/src/compiler/glsl/serialize.cpp
index fdd99ec59da..ad258f8bcb1 100644
--- a/src/compiler/glsl/serialize.cpp
+++ b/src/compiler/glsl/serialize.cpp
@@ -996,15 +996,14 @@ write_shader_parameters(struct blob *metadata,
struct gl_program_parameter_list *params)
{
blob_write_uint32(metadata, params->NumParameters);
- blob_write_uint32(metadata, params->NumParameterValues);
uint32_t i = 0;
while (i < params->NumParameters) {
struct gl_program_parameter *param = ¶ms->Parameters[i];
-
blob_write_uint32(metadata, param->Type);
blob_write_string(metadata, param->Name);
blob_write_uint32(metadata, param->Size);
+ blob_write_uint32(metadata, param->Padded);
blob_write_uint32(metadata, param->DataType);
blob_write_bytes(metadata, param->StateIndexes,
sizeof(param->StateIndexes));
@@ -1015,9 +1014,6 @@ write_shader_parameters(struct blob *metadata,
blob_write_bytes(metadata, params->ParameterValues,
sizeof(gl_constant_value) * params->NumParameterValues);
- blob_write_bytes(metadata, params->ParameterValueOffset,
- sizeof(uint32_t) * params->NumParameters);
-
blob_write_uint32(metadata, params->StateFlags);
}
@@ -1028,28 +1024,25 @@ read_shader_parameters(struct blob_reader *metadata,
gl_state_index16 state_indexes[STATE_LENGTH];
uint32_t i = 0;
uint32_t num_parameters = blob_read_uint32(metadata);
- uint32_t num_parameters_values = blob_read_uint32(metadata);
_mesa_reserve_parameter_storage(params, num_parameters);
while (i < num_parameters) {
gl_register_file type = (gl_register_file) blob_read_uint32(metadata);
const char *name = blob_read_string(metadata);
unsigned size = blob_read_uint32(metadata);
+ bool padded = blob_read_uint32(metadata);
unsigned data_type = blob_read_uint32(metadata);
blob_copy_bytes(metadata, (uint8_t *) state_indexes,
sizeof(state_indexes));
_mesa_add_parameter(params, type, name, size, data_type,
- NULL, state_indexes, false);
+ NULL, state_indexes, padded);
i++;
}
blob_copy_bytes(metadata, (uint8_t *) params->ParameterValues,
- sizeof(gl_constant_value) * num_parameters_values);
-
- blob_copy_bytes(metadata, (uint8_t *) params->ParameterValueOffset,
- sizeof(uint32_t) * num_parameters);
+ sizeof(gl_constant_value) * params->NumParameterValues);
params->StateFlags = blob_read_uint32(metadata);
}
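The serialize.cpp hunks above change the shader-cache parameter layout on the write and read sides at once; the reader must consume fields in exactly the order and width the writer emitted, which is why the new per-parameter Padded flag appears in both functions. A toy standalone sketch of that symmetry, using a miniature hypothetical blob API rather than Mesa's:

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct blob { uint8_t data[64]; size_t size, cursor; };

static void blob_write_u32(struct blob *b, uint32_t v)
{
   memcpy(b->data + b->size, &v, 4);
   b->size += 4;
}

static uint32_t blob_read_u32(struct blob *b)
{
   uint32_t v;
   memcpy(&v, b->data + b->cursor, 4);
   b->cursor += 4;
   return v;
}

int main(void)
{
   struct blob b = {{0}, 0, 0};

   /* writer: type, size, padded, data_type -- in that order */
   blob_write_u32(&b, 2);   /* type */
   blob_write_u32(&b, 4);   /* size */
   blob_write_u32(&b, 1);   /* padded (the newly serialized flag) */
   blob_write_u32(&b, 7);   /* data_type */

   /* reader: must mirror the writer exactly, field for field */
   assert(blob_read_u32(&b) == 2);
   assert(blob_read_u32(&b) == 4);
   assert(blob_read_u32(&b) == 1);
   assert(blob_read_u32(&b) == 7);
   return 0;
}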
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 90f4548030f..042f45a926d 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -260,6 +260,22 @@ glsl_type::contains_double() const
}
}
+bool
+glsl_type::contains_64bit() const
+{
+ if (this->is_array()) {
+ return this->fields.array->contains_64bit();
+ } else if (this->is_record() || this->is_interface()) {
+ for (unsigned int i = 0; i < this->length; i++) {
+ if (this->fields.structure[i].type->contains_64bit())
+ return true;
+ }
+ return false;
+ } else {
+ return this->is_64bit();
+ }
+}
+
bool
glsl_type::contains_opaque() const {
switch (base_type) {
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index bdaeee7ddd7..4767d197449 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -31,6 +31,7 @@
#include "shader_enums.h"
#include "blob.h"
#include "c11/threads.h"
+#include "util/macros.h"
#ifdef __cplusplus
#include "main/config.h"
@@ -114,6 +115,42 @@ static inline bool glsl_base_type_is_integer(enum glsl_base_type type)
type == GLSL_TYPE_IMAGE;
}
+static inline unsigned int
+glsl_base_type_get_bit_size(const enum glsl_base_type base_type)
+{
+ switch (base_type) {
+ case GLSL_TYPE_BOOL:
+ return 1;
+
+ case GLSL_TYPE_INT:
+ case GLSL_TYPE_UINT:
+ case GLSL_TYPE_FLOAT: /* TODO handle mediump */
+ case GLSL_TYPE_SUBROUTINE:
+ return 32;
+
+ case GLSL_TYPE_FLOAT16:
+ case GLSL_TYPE_UINT16:
+ case GLSL_TYPE_INT16:
+ return 16;
+
+ case GLSL_TYPE_UINT8:
+ case GLSL_TYPE_INT8:
+ return 8;
+
+ case GLSL_TYPE_DOUBLE:
+ case GLSL_TYPE_INT64:
+ case GLSL_TYPE_UINT64:
+ case GLSL_TYPE_IMAGE:
+ case GLSL_TYPE_SAMPLER:
+ return 64;
+
+ default:
+ unreachable("unknown base type");
+ }
+
+ return 0;
+}
+
enum glsl_sampler_dim {
GLSL_SAMPLER_DIM_1D = 0,
GLSL_SAMPLER_DIM_2D,
@@ -544,6 +581,12 @@ struct glsl_type {
*/
bool contains_double() const;
+ /**
+ * Query whether or not type is a 64-bit type, or for struct, interface and
+ * array types, contains a double type.
+ */
+ bool contains_64bit() const;
+
/**
* Query whether or not a type is a float type
*/
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 20a26a26255..e6784fcd41f 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -112,6 +112,7 @@ files_libnir = files(
'nir_lower_alu.c',
'nir_lower_alu_to_scalar.c',
'nir_lower_alpha_test.c',
+ 'nir_lower_array_deref_of_vec.c',
'nir_lower_atomics_to_ssbo.c',
'nir_lower_bitmap.c',
'nir_lower_bool_to_float.c',
@@ -133,6 +134,7 @@ files_libnir = files(
'nir_lower_io_arrays_to_elements.c',
'nir_lower_io_to_temporaries.c',
'nir_lower_io_to_scalar.c',
+ 'nir_lower_io_to_vector.c',
'nir_lower_packing.c',
'nir_lower_passthrough_edgeflags.c',
'nir_lower_patch_vertices.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index ff2c41faf27..c43226ba8df 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2825,7 +2825,7 @@ should_print_nir(void)
static inline void nir_validate_shader(nir_shader *shader, const char *when) { (void) shader; (void)when; }
static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; }
static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; }
-static inline bool should_skip_nir(const char *pass_name) { return false; }
+static inline bool should_skip_nir(UNUSED const char *pass_name) { return false; }
static inline bool should_clone_nir(void) { return false; }
static inline bool should_serialize_deserialize_nir(void) { return false; }
static inline bool should_print_nir(void) { return false; }
@@ -2910,6 +2910,16 @@ void nir_fixup_deref_modes(nir_shader *shader);
bool nir_lower_global_vars_to_local(nir_shader *shader);
+typedef enum {
+ nir_lower_direct_array_deref_of_vec_load = (1 << 0),
+ nir_lower_indirect_array_deref_of_vec_load = (1 << 1),
+ nir_lower_direct_array_deref_of_vec_store = (1 << 2),
+ nir_lower_indirect_array_deref_of_vec_store = (1 << 3),
+} nir_lower_array_deref_of_vec_options;
+
+bool nir_lower_array_deref_of_vec(nir_shader *shader, nir_variable_mode modes,
+ nir_lower_array_deref_of_vec_options options);
+
bool nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes);
bool nir_lower_locals_to_regs(nir_shader *shader);
@@ -2998,6 +3008,7 @@ void nir_lower_io_arrays_to_elements_no_indirects(nir_shader *shader,
bool outputs_only);
void nir_lower_io_to_scalar(nir_shader *shader, nir_variable_mode mask);
void nir_lower_io_to_scalar_early(nir_shader *shader, nir_variable_mode mask);
+bool nir_lower_io_to_vector(nir_shader *shader, nir_variable_mode mask);
typedef struct nir_lower_subgroups_options {
uint8_t subgroup_size;
@@ -3090,6 +3101,9 @@ typedef struct nir_lower_tex_options {
*/
uint8_t swizzles[32][4];
+ /* Can be used to scale sampled values into the range required by the format. */
+ float scale_factors[32];
+
/**
* Bitmap of textures that need srgb to linear conversion. If
* (lower_srgb & (1 << texture_index)) then the rgb (xyz) components
@@ -3138,6 +3152,12 @@ typedef struct nir_lower_tex_options {
*/
bool lower_txd_offset_clamp;
+ /**
+ * If true, lower nir_texop_txd with min_lod to a nir_texop_txl if the
+ * sampler index is not statically determinable to be less than 16.
+ */
+ bool lower_txd_clamp_if_sampler_index_not_lt_16;
+
/**
* If true, apply a .bagr swizzle on tg4 results to handle Broadcom's
* mixed-up tg4 locations.
@@ -3316,7 +3336,7 @@ bool nir_opt_move_comparisons(nir_shader *shader);
bool nir_opt_move_load_ubo(nir_shader *shader);
bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
- bool indirect_load_ok, bool expensive_alu_ok);
+ bool indirect_load_ok);
bool nir_opt_remove_phis(nir_shader *shader);
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 2a36eb3c91b..101bc7ad637 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -560,6 +560,35 @@ nir_channels(nir_builder *b, nir_ssa_def *def, nir_component_mask_t mask)
return nir_swizzle(b, def, swizzle, num_channels, false);
}
+static inline nir_ssa_def *
+_nir_vector_extract_helper(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *c,
+ unsigned start, unsigned end)
+{
+ if (start == end - 1) {
+ return nir_channel(b, vec, start);
+ } else {
+ unsigned mid = start + (end - start) / 2;
+ return nir_bcsel(b, nir_ilt(b, c, nir_imm_int(b, mid)),
+ _nir_vector_extract_helper(b, vec, c, start, mid),
+ _nir_vector_extract_helper(b, vec, c, mid, end));
+ }
+}
+
+static inline nir_ssa_def *
+nir_vector_extract(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *c)
+{
+ nir_src c_src = nir_src_for_ssa(c);
+ if (nir_src_is_const(c_src)) {
+ unsigned c_const = nir_src_as_uint(c_src);
+ if (c_const < vec->num_components)
+ return nir_channel(b, vec, c_const);
+ else
+ return nir_ssa_undef(b, 1, vec->bit_size);
+ } else {
+ return _nir_vector_extract_helper(b, vec, c, 0, vec->num_components);
+ }
+}
+
static inline nir_ssa_def *
nir_i2i(nir_builder *build, nir_ssa_def *x, unsigned dest_bit_size)
{
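nir_vector_extract above folds a constant in-range index to a plain channel read, returns undef for a constant out-of-range index, and otherwise builds a log2-depth tree of compares and selects via nir_bcsel. A standalone C sketch of the select-tree recursion, with plain floats standing in for SSA values:

#include <stdio.h>

static float
extract_helper(const float *vec, unsigned c, unsigned start, unsigned end)
{
   if (start == end - 1)
      return vec[start];

   unsigned mid = start + (end - start) / 2;
   /* In NIR this is an nir_bcsel over both recursively built halves. */
   return (c < mid) ? extract_helper(vec, c, start, mid)
                    : extract_helper(vec, c, mid, end);
}

int main(void)
{
   float v[4] = { 1.0f, 2.0f, 4.0f, 8.0f };
   for (unsigned c = 0; c < 4; c++)
      printf("component %u -> %g\n", c, extract_helper(v, c, 0, 4));
   return 0;
}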
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 557c7d29f53..24bef4f523a 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -151,9 +151,11 @@ nir_variable_clone(const nir_variable *var, nir_shader *shader)
nvar->name = ralloc_strdup(nvar, var->name);
nvar->data = var->data;
nvar->num_state_slots = var->num_state_slots;
- nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots);
- memcpy(nvar->state_slots, var->state_slots,
- var->num_state_slots * sizeof(nir_state_slot));
+ if (var->num_state_slots) {
+ nvar->state_slots = ralloc_array(nvar, nir_state_slot, var->num_state_slots);
+ memcpy(nvar->state_slots, var->state_slots,
+ var->num_state_slots * sizeof(nir_state_slot));
+ }
if (var->constant_initializer) {
nvar->constant_initializer =
nir_constant_clone(var->constant_initializer, nvar);
diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c
index 2f5fda643ca..1e321a66208 100644
--- a/src/compiler/nir/nir_deref.c
+++ b/src/compiler/nir/nir_deref.c
@@ -215,7 +215,7 @@ nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref,
unsigned field_offset =
struct_type_get_field_offset(parent->type, size_align,
(*p)->strct.index);
- nir_iadd(b, offset, nir_imm_int(b, field_offset));
+ offset = nir_iadd(b, offset, nir_imm_int(b, field_offset));
} else {
unreachable("Unsupported deref type");
}
@@ -574,10 +574,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl)
_mesa_hash_table_clear(state.cache, NULL);
nir_foreach_instr_safe(instr, block) {
- if (instr->type == nir_instr_type_deref) {
- nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr));
+ if (instr->type == nir_instr_type_deref &&
+ nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)))
continue;
- }
state.builder.cursor = nir_before_instr(instr);
nir_foreach_src(instr, rematerialize_deref_src, &state);
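The one-line nir_deref.c change above fixes a classic dropped-result bug: nir_iadd is a pure builder call that returns a new SSA value, so the result must be assigned back to the accumulator or the struct field offset is silently lost. A plain C analogue of the bug and the fix:

#include <stdio.h>

static int iadd(int a, int b) { return a + b; }   /* pure, like nir_iadd */

int main(void)
{
   int offset = 8, field_offset = 4;

   iadd(offset, field_offset);              /* bug: result discarded */
   printf("buggy offset: %d\n", offset);    /* still 8 */

   offset = iadd(offset, field_offset);     /* fixed */
   printf("fixed offset: %d\n", offset);    /* 12 */
   return 0;
}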
diff --git a/src/compiler/nir/nir_gather_xfb_info.c b/src/compiler/nir/nir_gather_xfb_info.c
index 96f0ece5e75..f2a2c0e6b99 100644
--- a/src/compiler/nir/nir_gather_xfb_info.c
+++ b/src/compiler/nir/nir_gather_xfb_info.c
@@ -33,7 +33,11 @@ add_var_xfb_outputs(nir_xfb_info *xfb,
unsigned *offset,
const struct glsl_type *type)
{
- if (glsl_type_is_array(type) || glsl_type_is_matrix(type)) {
+ /* If this type contains a 64-bit value, align to 8 bytes */
+ if (glsl_type_contains_64bit(type))
+ *offset = ALIGN_POT(*offset, 8);
+
+ if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
unsigned length = glsl_get_length(type);
const struct glsl_type *child_type = glsl_get_array_element(type);
for (unsigned i = 0; i < length; i++)
@@ -58,32 +62,43 @@ add_var_xfb_outputs(nir_xfb_info *xfb,
assert(var->data.stream < NIR_MAX_XFB_STREAMS);
xfb->streams_written |= (1 << var->data.stream);
- unsigned comp_slots = glsl_get_component_slots(type);
- unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
- assert(attrib_slots == glsl_count_attribute_slots(type, false));
-
- /* Ensure that we don't have, for instance, a dvec2 with a location_frac
- * of 2 which would make it crass a location boundary even though it
- * fits in a single slot. However, you can have a dvec3 which crosses
- * the slot boundary with a location_frac of 2.
- */
- assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == attrib_slots);
+ unsigned comp_slots;
+ if (var->data.compact) {
+ /* This only happens for clip/cull which are float arrays */
+ assert(glsl_without_array(type) == glsl_float_type());
+ assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
+ var->data.location == VARYING_SLOT_CLIP_DIST1);
+ comp_slots = glsl_get_length(type);
+ } else {
+ comp_slots = glsl_get_component_slots(type);
+
+ unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4);
+ assert(attrib_slots == glsl_count_attribute_slots(type, false));
+
+ /* Ensure that we don't have, for instance, a dvec2 with a
+ * location_frac of 2 which would make it cross a location boundary
+ * even though it fits in a single slot. However, you can have a
+ * dvec3 which crosses the slot boundary with a location_frac of 2.
+ */
+ assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) ==
+ attrib_slots);
+ }
assert(var->data.location_frac + comp_slots <= 8);
uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac;
- assert(attrib_slots <= 2);
- for (unsigned s = 0; s < attrib_slots; s++) {
+ while (comp_mask) {
nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++];
output->buffer = buffer;
- output->offset = *offset + s * 16;
+ output->offset = *offset;
output->location = *location;
- output->component_mask = (comp_mask >> (s * 4)) & 0xf;
+ output->component_mask = comp_mask & 0xf;
+ *offset += util_bitcount(output->component_mask) * 4;
(*location)++;
+ comp_mask >>= 4;
}
- *offset += comp_slots * 4;
}
}
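The rewritten loop above emits one transform-feedback output record per vec4 slot, consuming the component mask four bits at a time and advancing the byte offset by four bytes per set bit; a 64-bit value occupies two 32-bit component slots, which is how a dvec3 can span two locations. A standalone sketch of that bookkeeping, using assumed example values:

#include <stdio.h>

static unsigned popcount8(unsigned v)
{
   unsigned n = 0;
   for (; v; v >>= 1)
      n += v & 1;
   return n;
}

int main(void)
{
   /* dvec3 at location_frac 2: six 32-bit component slots from bit 2 */
   unsigned comp_slots = 6, location_frac = 2;
   unsigned comp_mask = ((1u << comp_slots) - 1) << location_frac;
   unsigned offset = 0, location = 7;   /* arbitrary example values */

   while (comp_mask) {
      unsigned slot_mask = comp_mask & 0xf;
      printf("location %u: mask 0x%x, byte offset %u\n",
             location, slot_mask, offset);
      offset += popcount8(slot_mask) * 4;
      location++;
      comp_mask >>= 4;
   }
   return 0;
}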
diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index aaa4204cce9..764fd6d443e 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -59,6 +59,15 @@ get_variable_io_mask(nir_variable *var, gl_shader_stage stage)
return ((1ull << slots) - 1) << location;
}
+static uint8_t
+get_num_components(nir_variable *var)
+{
+ if (glsl_type_is_struct(glsl_without_array(var->type)))
+ return 4;
+
+ return glsl_get_vector_elements(glsl_without_array(var->type));
+}
+
static void
tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
{
@@ -80,12 +89,14 @@ tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
continue;
nir_variable *var = nir_deref_instr_get_variable(deref);
- if (var->data.patch) {
- patches_read[var->data.location_frac] |=
- get_variable_io_mask(var, shader->info.stage);
- } else {
- read[var->data.location_frac] |=
- get_variable_io_mask(var, shader->info.stage);
+ for (unsigned i = 0; i < get_num_components(var); i++) {
+ if (var->data.patch) {
+ patches_read[var->data.location_frac + i] |=
+ get_variable_io_mask(var, shader->info.stage);
+ } else {
+ read[var->data.location_frac + i] |=
+ get_variable_io_mask(var, shader->info.stage);
+ }
}
}
}
@@ -161,22 +172,26 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
uint64_t patches_read[4] = { 0 }, patches_written[4] = { 0 };
nir_foreach_variable(var, &producer->outputs) {
- if (var->data.patch) {
- patches_written[var->data.location_frac] |=
- get_variable_io_mask(var, producer->info.stage);
- } else {
- written[var->data.location_frac] |=
- get_variable_io_mask(var, producer->info.stage);
+ for (unsigned i = 0; i < get_num_components(var); i++) {
+ if (var->data.patch) {
+ patches_written[var->data.location_frac + i] |=
+ get_variable_io_mask(var, producer->info.stage);
+ } else {
+ written[var->data.location_frac + i] |=
+ get_variable_io_mask(var, producer->info.stage);
+ }
}
}
nir_foreach_variable(var, &consumer->inputs) {
- if (var->data.patch) {
- patches_read[var->data.location_frac] |=
- get_variable_io_mask(var, consumer->info.stage);
- } else {
- read[var->data.location_frac] |=
- get_variable_io_mask(var, consumer->info.stage);
+ for (unsigned i = 0; i < get_num_components(var); i++) {
+ if (var->data.patch) {
+ patches_read[var->data.location_frac + i] |=
+ get_variable_io_mask(var, consumer->info.stage);
+ } else {
+ read[var->data.location_frac + i] |=
+ get_variable_io_mask(var, consumer->info.stage);
+ }
}
}
diff --git a/src/compiler/nir/nir_lower_array_deref_of_vec.c b/src/compiler/nir/nir_lower_array_deref_of_vec.c
new file mode 100644
index 00000000000..2a70dd1ddbc
--- /dev/null
+++ b/src/compiler/nir/nir_lower_array_deref_of_vec.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+static void
+build_write_masked_store(nir_builder *b, nir_deref_instr *vec_deref,
+ nir_ssa_def *value, unsigned component)
+{
+ assert(value->num_components == 1);
+ unsigned num_components = glsl_get_components(vec_deref->type);
+ assert(num_components > 1 && num_components <= NIR_MAX_VEC_COMPONENTS);
+
+ nir_ssa_def *u = nir_ssa_undef(b, 1, value->bit_size);
+ nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+ for (unsigned i = 0; i < num_components; i++)
+ comps[i] = (i == component) ? value : u;
+
+ nir_ssa_def *vec = nir_vec(b, comps, num_components);
+ nir_store_deref(b, vec_deref, vec, (1u << component));
+}
+
+static void
+build_write_masked_stores(nir_builder *b, nir_deref_instr *vec_deref,
+ nir_ssa_def *value, nir_ssa_def *index,
+ unsigned start, unsigned end)
+{
+ if (start == end - 1) {
+ build_write_masked_store(b, vec_deref, value, start);
+ } else {
+ unsigned mid = start + (end - start) / 2;
+ nir_push_if(b, nir_ilt(b, index, nir_imm_int(b, mid)));
+ build_write_masked_stores(b, vec_deref, value, index, start, mid);
+ nir_push_else(b, NULL);
+ build_write_masked_stores(b, vec_deref, value, index, mid, end);
+ nir_pop_if(b, NULL);
+ }
+}
+
+static bool
+nir_lower_array_deref_of_vec_impl(nir_function_impl *impl,
+ nir_variable_mode modes,
+ nir_lower_array_deref_of_vec_options options)
+{
+ bool progress = false;
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ assert(intrin->intrinsic != nir_intrinsic_copy_deref);
+
+ if (intrin->intrinsic != nir_intrinsic_load_deref &&
+ intrin->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
+ intrin->intrinsic != nir_intrinsic_interp_deref_at_sample &&
+ intrin->intrinsic != nir_intrinsic_interp_deref_at_offset &&
+ intrin->intrinsic != nir_intrinsic_store_deref)
+ continue;
+
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ if (!(deref->mode & modes))
+ continue;
+
+ /* We only care about array derefs that act on vectors */
+ if (deref->deref_type != nir_deref_type_array)
+ continue;
+
+ nir_deref_instr *vec_deref = nir_deref_instr_parent(deref);
+ if (!glsl_type_is_vector(vec_deref->type))
+ continue;
+
+ assert(intrin->num_components == 1);
+ unsigned num_components = glsl_get_components(vec_deref->type);
+ assert(num_components > 1 && num_components <= NIR_MAX_VEC_COMPONENTS);
+
+ b.cursor = nir_after_instr(&intrin->instr);
+
+ if (intrin->intrinsic == nir_intrinsic_store_deref) {
+ assert(intrin->src[1].is_ssa);
+ nir_ssa_def *value = intrin->src[1].ssa;
+
+ if (nir_src_is_const(deref->arr.index)) {
+ if (!(options & nir_lower_direct_array_deref_of_vec_store))
+ continue;
+
+ unsigned index = nir_src_as_uint(deref->arr.index);
+ /* If index is OOB, we throw the old store away and don't
+ * replace it with anything.
+ */
+ if (index < num_components)
+ build_write_masked_store(&b, vec_deref, value, index);
+ } else {
+ if (!(options & nir_lower_indirect_array_deref_of_vec_store))
+ continue;
+
+ nir_ssa_def *index = nir_ssa_for_src(&b, deref->arr.index, 1);
+ build_write_masked_stores(&b, vec_deref, value, index,
+ 0, num_components);
+ }
+ nir_instr_remove(&intrin->instr);
+
+ progress = true;
+ } else {
+ if (nir_src_is_const(deref->arr.index)) {
+ if (!(options & nir_lower_direct_array_deref_of_vec_load))
+ continue;
+ } else {
+ if (!(options & nir_lower_indirect_array_deref_of_vec_load))
+ continue;
+ }
+
+ /* Turn the load into a vector load */
+ nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
+ nir_src_for_ssa(&vec_deref->dest.ssa));
+ intrin->dest.ssa.num_components = num_components;
+ intrin->num_components = num_components;
+
+ nir_ssa_def *index = nir_ssa_for_src(&b, deref->arr.index, 1);
+ nir_ssa_def *scalar =
+ nir_vector_extract(&b, &intrin->dest.ssa, index);
+ if (scalar->parent_instr->type == nir_instr_type_ssa_undef) {
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+ nir_src_for_ssa(scalar));
+ nir_instr_remove(&intrin->instr);
+ } else {
+ nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa,
+ nir_src_for_ssa(scalar),
+ scalar->parent_instr);
+ }
+ progress = true;
+ }
+ }
+ }
+
+ if (progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
+ return progress;
+}
+
+/* Lowers away array dereferences on vectors
+ *
+ * These are allowed on certain variable types such as SSBOs and TCS outputs.
+ * However, not all consumers can actually handle them everywhere. There are
+ * also cases where we want to lower them for performance reasons.
+ *
+ * This pass assumes that copy_deref instructions have already been lowered.
+ */
+bool
+nir_lower_array_deref_of_vec(nir_shader *shader, nir_variable_mode modes,
+ nir_lower_array_deref_of_vec_options options)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl &&
+ nir_lower_array_deref_of_vec_impl(function->impl, modes, options))
+ progress = true;
+ }
+
+ return progress;
+}
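For the store side of the new pass, build_write_masked_store above materializes a full-width vector whose only defined channel is the one being written, then stores it with write mask (1 << component). A plain C sketch of that behavior, with arrays standing in for derefs and 0.0 standing in for undef:

#include <stdio.h>

static void
write_masked_store(float dest[4], unsigned num_components,
                   float value, unsigned component)
{
   float vec[4];
   unsigned write_mask = 1u << component;

   for (unsigned i = 0; i < num_components; i++)
      vec[i] = (i == component) ? value : 0.0f;   /* undef in NIR */

   /* Only channels selected by the write mask land in memory. */
   for (unsigned i = 0; i < num_components; i++) {
      if (write_mask & (1u << i))
         dest[i] = vec[i];
   }
}

int main(void)
{
   float dest[4] = { 1, 2, 3, 4 };
   write_masked_store(dest, 4, 9.0f, 2);
   printf("%g %g %g %g\n", dest[0], dest[1], dest[2], dest[3]); /* 1 2 9 4 */
   return 0;
}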
diff --git a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
index 6e1557ef40d..b7cd7c50b11 100644
--- a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
+++ b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
@@ -27,10 +27,10 @@
/**
* @file
*
- * This pass combines separate clip and cull distance arrays into a
- * single array that contains both. Clip distances come first, then
- * cull distances. It also populates nir_shader_info with the size
- * of the original arrays so the driver knows which are which.
+ * This pass combines clip and cull distance arrays in separate locations and
+ * colocates them both in VARYING_SLOT_CLIP_DIST0. It does so by maintaining
+ * two arrays but making them compact and using location_frac to stack them on
+ * top of each other.
*/
/**
@@ -56,77 +56,6 @@ get_unwrapped_array_length(nir_shader *nir, nir_variable *var)
return glsl_get_length(type);
}
-/**
- * Update the type of the combined array (including interface block nesting).
- */
-static void
-update_type(nir_variable *var, gl_shader_stage stage, unsigned length)
-{
- const struct glsl_type *type = glsl_array_type(glsl_float_type(), length, 0);
-
- if (nir_is_per_vertex_io(var, stage))
- type = glsl_array_type(type, glsl_get_length(var->type), 0);
-
- var->type = type;
-}
-
-static void
-rewrite_clip_cull_deref(nir_builder *b,
- nir_deref_instr *deref,
- const struct glsl_type *type,
- unsigned tail_offset)
-{
- deref->type = type;
-
- if (glsl_type_is_array(type)) {
- const struct glsl_type *child_type = glsl_get_array_element(type);
- nir_foreach_use(src, &deref->dest.ssa) {
- rewrite_clip_cull_deref(b, nir_instr_as_deref(src->parent_instr),
- child_type, tail_offset);
- }
- } else {
- assert(glsl_type_is_scalar(type));
-
- /* This is the end of the line. Add the tail offset if needed */
- if (tail_offset > 0) {
- b->cursor = nir_before_instr(&deref->instr);
- assert(deref->deref_type == nir_deref_type_array);
- nir_ssa_def *index = nir_iadd(b, deref->arr.index.ssa,
- nir_imm_int(b, tail_offset));
- nir_instr_rewrite_src(&deref->instr, &deref->arr.index,
- nir_src_for_ssa(index));
- }
- }
-}
-
-static void
-rewrite_references(nir_builder *b,
- nir_instr *instr,
- nir_variable *combined,
- unsigned cull_offset)
-{
- if (instr->type != nir_instr_type_deref)
- return;
-
- nir_deref_instr *deref = nir_instr_as_deref(instr);
- if (deref->deref_type != nir_deref_type_var)
- return;
-
- if (deref->var->data.mode != combined->data.mode)
- return;
-
- const unsigned location = deref->var->data.location;
- if (location != VARYING_SLOT_CLIP_DIST0 &&
- location != VARYING_SLOT_CULL_DIST0)
- return;
-
- deref->var = combined;
- if (location == VARYING_SLOT_CULL_DIST0)
- rewrite_clip_cull_deref(b, deref, combined->type, cull_offset);
- else
- rewrite_clip_cull_deref(b, deref, combined->type, 0);
-}
-
static bool
combine_clip_cull(nir_shader *nir,
struct exec_list *vars,
@@ -134,7 +63,6 @@ combine_clip_cull(nir_shader *nir,
{
nir_variable *cull = NULL;
nir_variable *clip = NULL;
- bool progress = false;
nir_foreach_variable(var, vars) {
if (var->data.location == VARYING_SLOT_CLIP_DIST0)
@@ -144,7 +72,9 @@ combine_clip_cull(nir_shader *nir,
cull = var;
}
- /* if the GLSL lowering pass has already run, don't bother repeating */
+ if (!cull && !clip)
+ return false;
+
if (!cull && clip) {
if (!glsl_type_is_array(clip->type))
return false;
@@ -158,50 +88,29 @@ combine_clip_cull(nir_shader *nir,
nir->info.cull_distance_array_size = cull_array_size;
}
- if (clip)
- clip->data.compact = true;
-
- if (cull)
- cull->data.compact = true;
-
- if (cull_array_size > 0) {
- if (clip_array_size == 0) {
- /* No clip distances, just change the cull distance location */
- cull->data.location = VARYING_SLOT_CLIP_DIST0;
- } else {
- /* Turn the ClipDistance array into a combined one */
- update_type(clip, nir->info.stage, clip_array_size + cull_array_size);
-
- /* Rewrite CullDistance to reference the combined array */
- nir_foreach_function(function, nir) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr(instr, block) {
- rewrite_references(&b, instr, clip, clip_array_size);
- }
- }
- }
- }
-
- /* Delete the old CullDistance variable */
- exec_node_remove(&cull->node);
- ralloc_free(cull);
- }
+ if (clip) {
+ assert(clip->data.compact);
+ clip->data.how_declared = nir_var_hidden;
+ }
- nir_foreach_function(function, nir) {
- if (function->impl) {
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
+ if (cull) {
+ assert(cull->data.compact);
+ cull->data.how_declared = nir_var_hidden;
+ cull->data.location = VARYING_SLOT_CLIP_DIST0 + clip_array_size / 4;
+ cull->data.location_frac = clip_array_size % 4;
+ }
+
+ nir_foreach_function(function, nir) {
+ if (function->impl) {
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance |
+ nir_metadata_live_ssa_defs |
+ nir_metadata_loop_analysis);
}
- progress = true;
}
- return progress;
+ return true;
}
bool
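The packing rule above places the compact cull-distance array immediately after the clip distances within the combined VARYING_SLOT_CLIP_DIST0/1 slots: dividing by 4 selects the slot, and the remainder selects the starting component. A standalone arithmetic sketch with an assumed clip array size:

#include <stdio.h>

int main(void)
{
   unsigned clip_array_size = 5;   /* e.g. gl_ClipDistance[5] */
   unsigned base_slot = 0;         /* stand-in for VARYING_SLOT_CLIP_DIST0 */

   unsigned cull_slot = base_slot + clip_array_size / 4;   /* slot 1 */
   unsigned cull_frac = clip_array_size % 4;               /* component 1 */

   printf("cull distances start at slot %u, component %u\n",
          cull_slot, cull_frac);
   return 0;
}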
diff --git a/src/compiler/nir/nir_lower_io_to_temporaries.c b/src/compiler/nir/nir_lower_io_to_temporaries.c
index 7602637d428..d2b069d3d68 100644
--- a/src/compiler/nir/nir_lower_io_to_temporaries.c
+++ b/src/compiler/nir/nir_lower_io_to_temporaries.c
@@ -85,7 +85,8 @@ emit_output_copies_impl(struct lower_io_state *state, nir_function_impl *impl)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- if (intrin->intrinsic == nir_intrinsic_emit_vertex) {
+ if (intrin->intrinsic == nir_intrinsic_emit_vertex ||
+ intrin->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
b.cursor = nir_before_instr(&intrin->instr);
emit_copies(&b, &state->shader->outputs, &state->old_outputs);
}
diff --git a/src/compiler/nir/nir_lower_io_to_vector.c b/src/compiler/nir/nir_lower_io_to_vector.c
new file mode 100644
index 00000000000..d979962373d
--- /dev/null
+++ b/src/compiler/nir/nir_lower_io_to_vector.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_deref.h"
+
+/** @file nir_lower_io_to_vector.c
+ *
+ * Merges compatible input/output variables residing in different components
+ * of the same location. It's expected that further passes such as
+ * nir_lower_io_to_temporaries will combine loads and stores of the merged
+ * variables, producing vector nir_load_input/nir_store_output instructions
+ * when all is said and done.
+ */
+
+static const struct glsl_type *
+resize_array_vec_type(const struct glsl_type *type, unsigned num_components)
+{
+ if (glsl_type_is_array(type)) {
+ const struct glsl_type *arr_elem =
+ resize_array_vec_type(glsl_get_array_element(type), num_components);
+ return glsl_array_type(arr_elem, glsl_get_length(type), 0);
+ } else {
+ assert(glsl_type_is_vector_or_scalar(type));
+ return glsl_vector_type(glsl_get_base_type(type), num_components);
+ }
+}
+
+static bool
+variable_can_rewrite(const nir_variable *var)
+{
+ /* Only touch user defined varyings as these are the only ones we split */
+ if (var->data.location < VARYING_SLOT_VAR0)
+ return false;
+
+ /* Skip complex types we don't split in the first place */
+ if (!glsl_type_is_vector_or_scalar(glsl_without_array(var->type)))
+ return false;
+
+ /* TODO: add 64/16bit support ? */
+ if (glsl_get_bit_size(glsl_without_array(var->type)) != 32)
+ return false;
+
+ return true;
+}
+
+static bool
+variables_can_merge(nir_shader *shader,
+ const nir_variable *a, const nir_variable *b)
+{
+ const struct glsl_type *a_type_tail = a->type;
+ const struct glsl_type *b_type_tail = b->type;
+
+ /* They must have the same array structure */
+ while (glsl_type_is_array(a_type_tail)) {
+ if (!glsl_type_is_array(b_type_tail))
+ return false;
+
+ if (glsl_get_length(a_type_tail) != glsl_get_length(b_type_tail))
+ return false;
+
+ a_type_tail = glsl_get_array_element(a_type_tail);
+ b_type_tail = glsl_get_array_element(b_type_tail);
+ }
+
+ if (!glsl_type_is_vector_or_scalar(a_type_tail) ||
+ !glsl_type_is_vector_or_scalar(b_type_tail))
+ return false;
+
+ if (glsl_get_base_type(a->type) != glsl_get_base_type(b->type))
+ return false;
+
+ assert(a->data.mode == b->data.mode);
+ if (shader->info.stage == MESA_SHADER_FRAGMENT &&
+ a->data.mode == nir_var_shader_in &&
+ a->data.interpolation != b->data.interpolation)
+ return false;
+
+ return true;
+}
+
+static bool
+create_new_io_vars(nir_shader *shader, struct exec_list *io_list,
+ nir_variable *old_vars[MAX_VARYINGS_INCL_PATCH][4],
+ nir_variable *new_vars[MAX_VARYINGS_INCL_PATCH][4])
+{
+ if (exec_list_is_empty(io_list))
+ return false;
+
+ nir_foreach_variable(var, io_list) {
+ if (variable_can_rewrite(var)) {
+ unsigned loc = var->data.location - VARYING_SLOT_VAR0;
+ unsigned frac = var->data.location_frac;
+ old_vars[loc][frac] = var;
+ }
+ }
+
+ bool merged_any_vars = false;
+
+ /* We don't handle combining vars of different type e.g. different array
+ * lengths.
+ */
+ for (unsigned loc = 0; loc < MAX_VARYINGS_INCL_PATCH; loc++) {
+ unsigned frac = 0;
+ while (frac < 4) {
+ nir_variable *first_var = old_vars[loc][frac];
+ if (!first_var) {
+ frac++;
+ continue;
+ }
+
+ int first = frac;
+ bool found_merge = false;
+
+ while (frac < 4) {
+ nir_variable *var = old_vars[loc][frac];
+ if (!var)
+ break;
+
+ if (var != first_var) {
+ if (!variables_can_merge(shader, first_var, var))
+ break;
+
+ found_merge = true;
+ }
+
+ const unsigned num_components =
+ glsl_get_components(glsl_without_array(var->type));
+
+ /* We had better not have any overlapping vars */
+ for (unsigned i = 1; i < num_components; i++)
+ assert(old_vars[loc][frac + i] == NULL);
+
+ frac += num_components;
+ }
+
+ if (!found_merge)
+ continue;
+
+ merged_any_vars = true;
+
+ nir_variable *var = nir_variable_clone(old_vars[loc][first], shader);
+ var->data.location_frac = first;
+ var->type = resize_array_vec_type(var->type, frac - first);
+
+ nir_shader_add_variable(shader, var);
+ for (unsigned i = first; i < frac; i++)
+ new_vars[loc][i] = var;
+ }
+ }
+
+ return merged_any_vars;
+}
+
+static nir_deref_instr *
+build_array_deref_of_new_var(nir_builder *b, nir_variable *new_var,
+ nir_deref_instr *leader)
+{
+ if (leader->deref_type == nir_deref_type_var)
+ return nir_build_deref_var(b, new_var);
+
+ nir_deref_instr *parent =
+ build_array_deref_of_new_var(b, new_var, nir_deref_instr_parent(leader));
+
+ return nir_build_deref_follower(b, parent, leader);
+}
+
+static bool
+nir_lower_io_to_vector_impl(nir_function_impl *impl, nir_variable_mode modes)
+{
+ assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ nir_metadata_require(impl, nir_metadata_dominance);
+
+ nir_shader *shader = impl->function->shader;
+ nir_variable *old_inputs[MAX_VARYINGS_INCL_PATCH][4] = {0};
+ nir_variable *new_inputs[MAX_VARYINGS_INCL_PATCH][4] = {0};
+ nir_variable *old_outputs[MAX_VARYINGS_INCL_PATCH][4] = {0};
+ nir_variable *new_outputs[MAX_VARYINGS_INCL_PATCH][4] = {0};
+
+ if (modes & nir_var_shader_in) {
+ /* Vertex shaders support overlapping inputs. We don't do those */
+ assert(b.shader->info.stage != MESA_SHADER_VERTEX);
+
+ /* If we don't actually merge any variables, remove that bit from modes
+ * so we don't bother doing extra non-work.
+ */
+ if (!create_new_io_vars(shader, &shader->inputs,
+ old_inputs, new_inputs))
+ modes &= ~nir_var_shader_in;
+ }
+
+ if (modes & nir_var_shader_out) {
+ /* Fragment shader outputs are always vec4. You shouldn't have
+ * scalarized them and it doesn't make sense to vectorize them.
+ */
+ assert(b.shader->info.stage != MESA_SHADER_FRAGMENT);
+
+ /* If we don't actually merge any variables, remove that bit from modes
+ * so we don't bother doing extra non-work.
+ */
+ if (!create_new_io_vars(shader, &shader->outputs,
+ old_outputs, new_outputs))
+ modes &= ~nir_var_shader_out;
+ }
+
+ if (!modes)
+ return false;
+
+ bool progress = false;
+
+ /* Actually lower all the IO load/store intrinsics. Load instructions are
+ * lowered to a vector load and an ALU instruction to grab the channels we
+ * want. Outputs are lowered to a write-masked store of the vector output.
+ * For non-TCS outputs, we then run nir_lower_io_to_temporaries at the end
+ * to clean up the partial writes.
+ */
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_deref:
+ case nir_intrinsic_interp_deref_at_centroid:
+ case nir_intrinsic_interp_deref_at_sample:
+ case nir_intrinsic_interp_deref_at_offset: {
+ nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]);
+ if (!(old_deref->mode & modes))
+ break;
+
+ if (old_deref->mode == nir_var_shader_out)
+ assert(b.shader->info.stage == MESA_SHADER_TESS_CTRL);
+
+ nir_variable *old_var = nir_deref_instr_get_variable(old_deref);
+ if (old_var->data.location < VARYING_SLOT_VAR0)
+ break;
+
+ const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0;
+ const unsigned old_frac = old_var->data.location_frac;
+ nir_variable *new_var = old_deref->mode == nir_var_shader_in ?
+ new_inputs[loc][old_frac] :
+ new_outputs[loc][old_frac];
+ if (!new_var)
+ break;
+
+ assert(new_var->data.location == VARYING_SLOT_VAR0 + loc);
+ const unsigned new_frac = new_var->data.location_frac;
+
+ nir_component_mask_t vec4_comp_mask =
+ ((1 << intrin->num_components) - 1) << old_frac;
+
+ b.cursor = nir_before_instr(&intrin->instr);
+
+ /* Rewrite the load to use the new variable and only select a
+ * portion of the result.
+ */
+ nir_deref_instr *new_deref =
+ build_array_deref_of_new_var(&b, new_var, old_deref);
+ assert(glsl_type_is_vector(new_deref->type));
+ nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
+ nir_src_for_ssa(&new_deref->dest.ssa));
+
+ intrin->num_components =
+ glsl_get_components(new_deref->type);
+ intrin->dest.ssa.num_components = intrin->num_components;
+
+ b.cursor = nir_after_instr(&intrin->instr);
+
+ nir_ssa_def *new_vec = nir_channels(&b, &intrin->dest.ssa,
+ vec4_comp_mask >> new_frac);
+ nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa,
+ nir_src_for_ssa(new_vec),
+ new_vec->parent_instr);
+
+ progress = true;
+ break;
+ }
+
+ case nir_intrinsic_store_deref: {
+ nir_deref_instr *old_deref = nir_src_as_deref(intrin->src[0]);
+ if (old_deref->mode != nir_var_shader_out)
+ break;
+
+ nir_variable *old_var = nir_deref_instr_get_variable(old_deref);
+ if (old_var->data.location < VARYING_SLOT_VAR0)
+ break;
+
+ const unsigned loc = old_var->data.location - VARYING_SLOT_VAR0;
+ const unsigned old_frac = old_var->data.location_frac;
+ nir_variable *new_var = new_outputs[loc][old_frac];
+ if (!new_var)
+ break;
+
+ assert(new_var->data.location == VARYING_SLOT_VAR0 + loc);
+ const unsigned new_frac = new_var->data.location_frac;
+
+ b.cursor = nir_before_instr(&intrin->instr);
+
+ /* Rewrite the store to be a masked store to the new variable */
+ nir_deref_instr *new_deref =
+ build_array_deref_of_new_var(&b, new_var, old_deref);
+ assert(glsl_type_is_vector(new_deref->type));
+ nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
+ nir_src_for_ssa(&new_deref->dest.ssa));
+
+ intrin->num_components =
+ glsl_get_components(new_deref->type);
+
+ nir_component_mask_t old_wrmask = nir_intrinsic_write_mask(intrin);
+
+ assert(intrin->src[1].is_ssa);
+ nir_ssa_def *old_value = intrin->src[1].ssa;
+ nir_ssa_def *comps[4];
+ for (unsigned c = 0; c < intrin->num_components; c++) {
+ if (new_frac + c >= old_frac &&
+ (old_wrmask & 1 << (new_frac + c - old_frac))) {
+ comps[c] = nir_channel(&b, old_value,
+ new_frac + c - old_frac);
+ } else {
+ comps[c] = nir_ssa_undef(&b, old_value->num_components,
+ old_value->bit_size);
+ }
+ }
+ nir_ssa_def *new_value = nir_vec(&b, comps, intrin->num_components);
+ nir_instr_rewrite_src(&intrin->instr, &intrin->src[1],
+ nir_src_for_ssa(new_value));
+
+ nir_intrinsic_set_write_mask(intrin,
+ old_wrmask << (old_frac - new_frac));
+
+ progress = true;
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ }
+
+ if (progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
+ return progress;
+}
+
+bool
+nir_lower_io_to_vector(nir_shader *shader, nir_variable_mode modes)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl)
+ progress |= nir_lower_io_to_vector_impl(function->impl, modes);
+ }
+
+ return progress;
+}
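
For readers of the store path above, a hand-worked example of the write-mask shift (illustration only; the values are invented and this snippet is not part of the patch):

/* Sketch: the mask arithmetic from the masked-store rewrite above. */
#include <stdio.h>

int main(void)
{
   unsigned old_frac = 2;     /* old scalar output lived in component .z */
   unsigned new_frac = 0;     /* new vec4 variable starts at .x */
   unsigned old_wrmask = 0x1; /* the store wrote its single component */

   /* nir_intrinsic_set_write_mask(intrin, old_wrmask << (old_frac - new_frac)) */
   printf("new write mask: 0x%x\n", old_wrmask << (old_frac - new_frac));
   /* prints 0x4: the masked store touches only .z of the new vec4 */
   return 0;
}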
diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c
index a618b86b34c..11afffe3dee 100644
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -306,7 +306,8 @@ lower_implicit_lod(nir_builder *b, nir_tex_instr *tex)
}
static nir_ssa_def *
-sample_plane(nir_builder *b, nir_tex_instr *tex, int plane)
+sample_plane(nir_builder *b, nir_tex_instr *tex, int plane,
+ const nir_lower_tex_options *options)
{
assert(tex->dest.is_ssa);
assert(nir_tex_instr_dest_size(tex) == 4);
@@ -334,6 +335,11 @@ sample_plane(nir_builder *b, nir_tex_instr *tex, int plane)
nir_builder_instr_insert(b, &plane_tex->instr);
+ /* If a scale factor is set for this texture, return the scaled sample. */
+ if (options->scale_factors[tex->texture_index])
+ return nir_fmul_imm(b, &plane_tex->dest.ssa,
+ options->scale_factors[tex->texture_index]);
+
return &plane_tex->dest.ssa;
}
@@ -366,12 +372,13 @@ convert_yuv_to_rgb(nir_builder *b, nir_tex_instr *tex,
}
static void
-lower_y_uv_external(nir_builder *b, nir_tex_instr *tex)
+lower_y_uv_external(nir_builder *b, nir_tex_instr *tex,
+ const nir_lower_tex_options *options)
{
b->cursor = nir_after_instr(&tex->instr);
- nir_ssa_def *y = sample_plane(b, tex, 0);
- nir_ssa_def *uv = sample_plane(b, tex, 1);
+ nir_ssa_def *y = sample_plane(b, tex, 0, options);
+ nir_ssa_def *uv = sample_plane(b, tex, 1, options);
convert_yuv_to_rgb(b, tex,
nir_channel(b, y, 0),
@@ -381,13 +388,14 @@ lower_y_uv_external(nir_builder *b, nir_tex_instr *tex)
}
static void
-lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex)
+lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex,
+ const nir_lower_tex_options *options)
{
b->cursor = nir_after_instr(&tex->instr);
- nir_ssa_def *y = sample_plane(b, tex, 0);
- nir_ssa_def *u = sample_plane(b, tex, 1);
- nir_ssa_def *v = sample_plane(b, tex, 2);
+ nir_ssa_def *y = sample_plane(b, tex, 0, options);
+ nir_ssa_def *u = sample_plane(b, tex, 1, options);
+ nir_ssa_def *v = sample_plane(b, tex, 2, options);
convert_yuv_to_rgb(b, tex,
nir_channel(b, y, 0),
@@ -397,12 +405,13 @@ lower_y_u_v_external(nir_builder *b, nir_tex_instr *tex)
}
static void
-lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex)
+lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex,
+ const nir_lower_tex_options *options)
{
b->cursor = nir_after_instr(&tex->instr);
- nir_ssa_def *y = sample_plane(b, tex, 0);
- nir_ssa_def *xuxv = sample_plane(b, tex, 1);
+ nir_ssa_def *y = sample_plane(b, tex, 0, options);
+ nir_ssa_def *xuxv = sample_plane(b, tex, 1, options);
convert_yuv_to_rgb(b, tex,
nir_channel(b, y, 0),
@@ -412,12 +421,13 @@ lower_yx_xuxv_external(nir_builder *b, nir_tex_instr *tex)
}
static void
-lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex)
+lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex,
+ const nir_lower_tex_options *options)
{
b->cursor = nir_after_instr(&tex->instr);
- nir_ssa_def *y = sample_plane(b, tex, 0);
- nir_ssa_def *uxvx = sample_plane(b, tex, 1);
+ nir_ssa_def *y = sample_plane(b, tex, 0, options);
+ nir_ssa_def *uxvx = sample_plane(b, tex, 1, options);
convert_yuv_to_rgb(b, tex,
nir_channel(b, y, 1),
@@ -427,11 +437,12 @@ lower_xy_uxvx_external(nir_builder *b, nir_tex_instr *tex)
}
static void
-lower_ayuv_external(nir_builder *b, nir_tex_instr *tex)
+lower_ayuv_external(nir_builder *b, nir_tex_instr *tex,
+ const nir_lower_tex_options *options)
{
b->cursor = nir_after_instr(&tex->instr);
- nir_ssa_def *ayuv = sample_plane(b, tex, 0);
+ nir_ssa_def *ayuv = sample_plane(b, tex, 0, options);
convert_yuv_to_rgb(b, tex,
nir_channel(b, ayuv, 2),
@@ -879,6 +890,25 @@ lower_tex_packing(nir_builder *b, nir_tex_instr *tex,
color->parent_instr);
}
+static bool
+sampler_index_lt(nir_tex_instr *tex, unsigned max)
+{
+ assert(nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref) == -1);
+
+ unsigned sampler_index = tex->sampler_index;
+
+ int sampler_offset_idx =
+ nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
+ if (sampler_offset_idx >= 0) {
+ if (!nir_src_is_const(tex->src[sampler_offset_idx].src))
+ return false;
+
+ sampler_index += nir_src_as_uint(tex->src[sampler_offset_idx].src);
+ }
+
+ return sampler_index < max;
+}
+
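
A hypothetical walk-through of the helper above (values invented, not from the patch; see the condition added further down for where the result is consumed):

/* Sketch: the arithmetic sampler_index_lt() performs. */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
   unsigned sampler_index = 14;        /* tex->sampler_index */
   unsigned const_sampler_offset = 3;  /* constant nir_tex_src_sampler_offset */

   sampler_index += const_sampler_offset;  /* effective index: 17 */
   bool lt = sampler_index < 16;           /* false */
   printf("sampler_index_lt(tex, 16) -> %d\n", lt);
   /* false here, so lower_txd_clamp_if_sampler_index_not_lt_16 (checked in
    * the condition added below) forces full TXD lowering for this instr. */
   return 0;
}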
static bool
nir_lower_tex_block(nir_block *block, nir_builder *b,
const nir_lower_tex_options *options)
@@ -923,27 +953,27 @@ nir_lower_tex_block(nir_block *block, nir_builder *b,
}
if ((1 << tex->texture_index) & options->lower_y_uv_external) {
- lower_y_uv_external(b, tex);
+ lower_y_uv_external(b, tex, options);
progress = true;
}
if ((1 << tex->texture_index) & options->lower_y_u_v_external) {
- lower_y_u_v_external(b, tex);
+ lower_y_u_v_external(b, tex, options);
progress = true;
}
if ((1 << tex->texture_index) & options->lower_yx_xuxv_external) {
- lower_yx_xuxv_external(b, tex);
+ lower_yx_xuxv_external(b, tex, options);
progress = true;
}
if ((1 << tex->texture_index) & options->lower_xy_uxvx_external) {
- lower_xy_uxvx_external(b, tex);
+ lower_xy_uxvx_external(b, tex, options);
progress = true;
}
if ((1 << tex->texture_index) & options->lower_ayuv_external) {
- lower_ayuv_external(b, tex);
+ lower_ayuv_external(b, tex, options);
progress = true;
}
@@ -995,6 +1025,8 @@ nir_lower_tex_block(nir_block *block, nir_builder *b,
(options->lower_txd_shadow && tex->is_shadow) ||
(options->lower_txd_shadow_clamp && tex->is_shadow && has_min_lod) ||
(options->lower_txd_offset_clamp && has_offset && has_min_lod) ||
+ (options->lower_txd_clamp_if_sampler_index_not_lt_16 &&
+ has_min_lod && !sampler_index_lt(tex, 16)) ||
(options->lower_txd_cube_map &&
tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) ||
(options->lower_txd_3d &&
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index d32005846a6..f52e623ef0f 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -404,12 +404,21 @@ def unpack_4x8(fmt):
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
-if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
-if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
-if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
-if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
-if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
-if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
+
+float ma = 0.0;
+if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
+if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
+if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
+
+if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
+if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
+if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
+if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
+if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
+if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
+
+dst.x = dst.x / ma + 0.5;
+dst.y = dst.y / ma + 0.5;
""")
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 75a3d2ad238..53c842b9ef9 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -618,11 +618,11 @@
# Reassociate constants in add/mul chains so they can be folded together.
# For now, we mostly only handle cases where the constants are separated by
# a single non-constant. We could do better eventually.
- (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)),
- (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)),
- (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)),
- (('~fadd', '#a', ('fneg', ('fadd', b, '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
- (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)),
+ (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
+ (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
+ (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
+ (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
+ (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
# By definition...
(('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
@@ -929,9 +929,6 @@ def bitfield_reverse(u):
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
(('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
- (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)),
- (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)),
-
# we do these late so that we don't get in the way of creating ffmas
(('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
(('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
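
Why the new b(is_not_const) qualifier matters (illustration, values invented): when the middle operand is itself a constant, the rewritten expression still matches the pattern through fmul's commutativity, so successive applications ping-pong between equivalent forms.

# Illustration only, in the pattern notation used above:
#   fmul(#2, fmul(x, #3))   -> fmul(fmul(#2, #3), x)          # desired rewrite
#   fmul(#2, fmul(#4, #3))  -> fmul(fmul(#2, #3), #4)         # b constant...
#                           -> fmul(fmul(#4, #3), #2) -> ...  # matches again, forever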
diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c b/src/compiler/nir/nir_opt_copy_prop_vars.c
index 392fef407cf..a71cce19a80 100644
--- a/src/compiler/nir/nir_opt_copy_prop_vars.c
+++ b/src/compiler/nir/nir_opt_copy_prop_vars.c
@@ -653,7 +653,7 @@ copy_prop_vars_block(struct copy_prop_var_state *state,
struct copy_entry *src_entry =
lookup_entry_for_deref(copies, src, nir_derefs_a_contains_b_bit);
- struct value value;
+ struct value value = {0};
if (try_load_from_entry(state, src_entry, b, intrin, src, &value)) {
if (value.is_ssa) {
/* lookup_load has already ensured that we get a single SSA
diff --git a/src/compiler/nir/nir_opt_idiv_const.c b/src/compiler/nir/nir_opt_idiv_const.c
index 7fa739161ba..3e4b7a42d42 100644
--- a/src/compiler/nir/nir_opt_idiv_const.c
+++ b/src/compiler/nir/nir_opt_idiv_const.c
@@ -65,15 +65,17 @@ build_umod(nir_builder *b, nir_ssa_def *n, uint64_t d)
static nir_ssa_def *
build_idiv(nir_builder *b, nir_ssa_def *n, int64_t d)
{
+ uint64_t abs_d = d < 0 ? -d : d;
+
if (d == 0) {
return nir_imm_intN_t(b, 0, n->bit_size);
} else if (d == 1) {
return n;
} else if (d == -1) {
return nir_ineg(b, n);
- } else if (util_is_power_of_two_or_zero64(d)) {
- uint64_t abs_d = d < 0 ? -d : d;
- nir_ssa_def *uq = nir_ishr(b, n, nir_imm_int(b, util_logbase2_64(abs_d)));
+ } else if (util_is_power_of_two_or_zero64(abs_d)) {
+ nir_ssa_def *uq = nir_ushr(b, nir_iabs(b, n),
+ nir_imm_int(b, util_logbase2_64(abs_d)));
nir_ssa_def *n_neg = nir_ilt(b, n, nir_imm_intN_t(b, 0, n->bit_size));
nir_ssa_def *neg = d < 0 ? nir_inot(b, n_neg) : n_neg;
return nir_bcsel(b, neg, nir_ineg(b, uq), uq);
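
A hand-worked check of the power-of-two fix above (numbers invented): with n = -7 and d = 4, the old code emitted an arithmetic shift, -7 >> 2 = -2, which rounds toward negative infinity; integer division must truncate toward zero, giving -1. The new take-absolute-value-then-negate sequence does.

/* Hand-checked illustration of the build_idiv() fix above. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
   int64_t n = -7, d = 4;
   int64_t old_result = n >> 2;                     /* -2: rounds toward -inf */
   uint64_t uq = (uint64_t)llabs(n) >> 2;           /* 1 */
   int64_t new_result = n < 0 ? -(int64_t)uq : uq;  /* -1: truncates like n / d */
   printf("old %lld, new %lld, reference %lld\n", (long long)old_result,
          (long long)new_result, (long long)(n / d));
   return 0;
}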
diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
index c2f945d4d59..ba94807bb20 100644
--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -313,6 +313,13 @@ opt_if_loop_last_continue(nir_loop *loop)
if (!then_ends_in_continue && !else_ends_in_continue)
return false;
+   /* If the block immediately after the if/else is empty, bail; otherwise
+    * we might end up looping forever.
+    */
+ if (&nif->cf_node == nir_cf_node_prev(&last_block->cf_node) &&
+ exec_list_is_empty(&last_block->instr_list))
+ return false;
+
/* Move the last block of the loop inside the last if-statement */
nir_cf_list tmp;
nir_cf_extract(&tmp, nir_after_cf_node(if_node),
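
The infinite-progress shape the early return above guards against, sketched as pseudocode (illustration only):

/* Illustration only:
 *
 *    loop {
 *       ...
 *       if (cond) { ... continue; } else { ... }
 *       // last_block: empty, and the if/else is its immediate predecessor
 *    }
 *
 * Moving the empty last_block into the if changes nothing, so without the
 * bail-out the pass reported progress on every run and never settled.
 */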
diff --git a/src/compiler/nir/nir_opt_move_load_ubo.c b/src/compiler/nir/nir_opt_move_load_ubo.c
index a32f1704427..f36a62a5308 100644
--- a/src/compiler/nir/nir_opt_move_load_ubo.c
+++ b/src/compiler/nir/nir_opt_move_load_ubo.c
@@ -91,7 +91,7 @@ move_load_ubo(nir_block *block)
}
}
- return false;
+ return progress;
}
bool
diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
index 1deb02a380e..32d337f99dd 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -59,8 +59,7 @@
static bool
block_check_for_allowed_instrs(nir_block *block, unsigned *count,
- bool alu_ok, bool indirect_load_ok,
- bool expensive_alu_ok)
+ bool alu_ok, bool indirect_load_ok)
{
nir_foreach_instr(instr, block) {
switch (instr->type) {
@@ -118,25 +117,6 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count,
case nir_op_vec3:
case nir_op_vec4:
break;
-
- case nir_op_fcos:
- case nir_op_fdiv:
- case nir_op_fexp2:
- case nir_op_flog2:
- case nir_op_fmod:
- case nir_op_fpow:
- case nir_op_frcp:
- case nir_op_frem:
- case nir_op_frsq:
- case nir_op_fsin:
- case nir_op_idiv:
- case nir_op_irem:
- case nir_op_udiv:
- if (!alu_ok || !expensive_alu_ok)
- return false;
-
- break;
-
default:
if (!alu_ok) {
/* It must be a move-like operation. */
@@ -180,8 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count,
static bool
nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
- unsigned limit, bool indirect_load_ok,
- bool expensive_alu_ok)
+ unsigned limit, bool indirect_load_ok)
{
if (nir_cf_node_is_first(&block->cf_node))
return false;
@@ -202,9 +181,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
/* ... and those blocks must only contain "allowed" instructions. */
unsigned count = 0;
if (!block_check_for_allowed_instrs(then_block, &count, limit != 0,
- indirect_load_ok, expensive_alu_ok) ||
+ indirect_load_ok) ||
!block_check_for_allowed_instrs(else_block, &count, limit != 0,
- indirect_load_ok, expensive_alu_ok))
+ indirect_load_ok))
return false;
if (count > limit)
@@ -271,15 +250,14 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
static bool
nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,
- bool indirect_load_ok, bool expensive_alu_ok)
+ bool indirect_load_ok)
{
nir_shader *shader = impl->function->shader;
bool progress = false;
nir_foreach_block_safe(block, impl) {
progress |= nir_opt_peephole_select_block(block, shader, limit,
- indirect_load_ok,
- expensive_alu_ok);
+ indirect_load_ok);
}
if (progress) {
@@ -295,15 +273,14 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,
bool
nir_opt_peephole_select(nir_shader *shader, unsigned limit,
- bool indirect_load_ok, bool expensive_alu_ok)
+ bool indirect_load_ok)
{
bool progress = false;
nir_foreach_function(function, shader) {
if (function->impl)
progress |= nir_opt_peephole_select_impl(function->impl, limit,
- indirect_load_ok,
- expensive_alu_ok);
+ indirect_load_ok);
}
return progress;
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index 80bc25fde9a..422249677b7 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -812,8 +812,8 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
assert(dim < ARRAY_SIZE(dim_name) && dim_name[dim]);
fprintf(fp, " image_dim=%s", dim_name[dim]);
} else if (idx == NIR_INTRINSIC_IMAGE_ARRAY) {
- bool array = nir_intrinsic_image_dim(instr);
- fprintf(fp, " image_dim=%s", array ? "true" : "false");
+ bool array = nir_intrinsic_image_array(instr);
+ fprintf(fp, " image_array=%s", array ? "true" : "false");
} else if (idx == NIR_INTRINSIC_DESC_TYPE) {
VkDescriptorType desc_type = nir_intrinsic_desc_type(instr);
fprintf(fp, " desc_type=%s", vulkan_descriptor_type_name(desc_type));
diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c
index b4d22d91c27..f182818374d 100644
--- a/src/compiler/nir/nir_repair_ssa.c
+++ b/src/compiler/nir/nir_repair_ssa.c
@@ -77,6 +77,15 @@ repair_ssa_def(nir_ssa_def *def, void *void_state)
}
}
+ nir_foreach_if_use(src, def) {
+ nir_block *block_before_if =
+ nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node));
+ if (!nir_block_dominates(def->parent_instr->block, block_before_if)) {
+ is_valid = false;
+ break;
+ }
+ }
+
if (is_valid)
return true;
@@ -98,6 +107,15 @@ repair_ssa_def(nir_ssa_def *def, void *void_state)
}
}
+ nir_foreach_if_use_safe(src, def) {
+ nir_block *block_before_if =
+ nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node));
+ if (!nir_block_dominates(def->parent_instr->block, block_before_if)) {
+ nir_if_rewrite_condition(src->parent_if, nir_src_for_ssa(
+ nir_phi_builder_value_get_block_def(val, block_before_if)));
+ }
+ }
+
return true;
}
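
What the new if-use handling repairs, sketched (illustration only; a def that stops dominating an if-condition use can arise after control-flow transforms break SSA form):

/* Illustration only:
 *
 *    if (a) { x = ...; }   // x defined on only one path
 *    ...
 *    if (x) { ... }        // x used as an if condition
 *
 * x's defining block no longer dominates the block before the second if,
 * so the loops added above detect the broken use and rewrite the condition
 * with the phi-builder's def for that block.
 */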
diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h
index 89f1cba5c52..1fb450752ad 100644
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -116,22 +116,6 @@ is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components,
return !nir_src_is_const(instr->src[src].src);
}
-static inline bool
-is_used_more_than_once(nir_alu_instr *instr)
-{
- bool zero_if_use = list_empty(&instr->dest.dest.ssa.if_uses);
- bool zero_use = list_empty(&instr->dest.dest.ssa.uses);
-
- if (zero_use && zero_if_use)
- return false;
- else if (zero_use && list_is_singular(&instr->dest.dest.ssa.if_uses))
- return false;
- else if (zero_if_use && list_is_singular(&instr->dest.dest.ssa.uses))
- return false;
-
- return true;
-}
-
static inline bool
is_used_once(nir_alu_instr *instr)
{
diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index b4bde5470c0..3a406e99769 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -326,6 +326,12 @@ glsl_type_is_integer(const struct glsl_type *type)
return type->is_integer();
}
+bool
+glsl_type_contains_64bit(const struct glsl_type *type)
+{
+ return type->contains_64bit();
+}
+
const glsl_type *
glsl_void_type(void)
{
diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index 40cddf76374..eb5cdf0a089 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -97,37 +97,7 @@ unsigned glsl_atomic_size(const struct glsl_type *type);
static inline unsigned
glsl_get_bit_size(const struct glsl_type *type)
{
- switch (glsl_get_base_type(type)) {
- case GLSL_TYPE_BOOL:
- return 1;
-
- case GLSL_TYPE_INT:
- case GLSL_TYPE_UINT:
- case GLSL_TYPE_FLOAT: /* TODO handle mediump */
- case GLSL_TYPE_SUBROUTINE:
- return 32;
-
- case GLSL_TYPE_FLOAT16:
- case GLSL_TYPE_UINT16:
- case GLSL_TYPE_INT16:
- return 16;
-
- case GLSL_TYPE_UINT8:
- case GLSL_TYPE_INT8:
- return 8;
-
- case GLSL_TYPE_DOUBLE:
- case GLSL_TYPE_INT64:
- case GLSL_TYPE_UINT64:
- case GLSL_TYPE_IMAGE:
- case GLSL_TYPE_SAMPLER:
- return 64;
-
- default:
- unreachable("unknown base type");
- }
-
- return 0;
+ return glsl_base_type_get_bit_size(glsl_get_base_type(type));
}
bool glsl_type_is_16bit(const struct glsl_type *type);
@@ -149,6 +119,7 @@ bool glsl_type_is_dual_slot(const struct glsl_type *type);
bool glsl_type_is_numeric(const struct glsl_type *type);
bool glsl_type_is_boolean(const struct glsl_type *type);
bool glsl_type_is_integer(const struct glsl_type *type);
+bool glsl_type_contains_64bit(const struct glsl_type *type);
bool glsl_sampler_type_is_shadow(const struct glsl_type *type);
bool glsl_sampler_type_is_array(const struct glsl_type *type);
bool glsl_contains_atomic(const struct glsl_type *type);
diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index c3dbe764961..e82f465b256 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -36,6 +36,8 @@ struct spirv_supported_capabilities {
bool address;
bool atomic_storage;
bool descriptor_array_dynamic_indexing;
+ bool descriptor_array_non_uniform_indexing;
+ bool descriptor_indexing;
bool device_group;
bool draw_parameters;
bool float64;
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index 9bfe5805919..f76cac88f18 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -494,6 +494,7 @@ vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode,
break;
case SpvOpDecorate:
+ case SpvOpDecorateId:
case SpvOpMemberDecorate:
case SpvOpDecorateStringGOOGLE:
case SpvOpMemberDecorateStringGOOGLE:
@@ -503,6 +504,7 @@ vtn_handle_decoration(struct vtn_builder *b, SpvOp opcode,
struct vtn_decoration *dec = rzalloc(b, struct vtn_decoration);
switch (opcode) {
case SpvOpDecorate:
+ case SpvOpDecorateId:
case SpvOpDecorateStringGOOGLE:
dec->scope = VTN_DEC_DECORATION;
break;
@@ -2155,6 +2157,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
case nir_texop_txl:
case nir_texop_txd:
case nir_texop_tg4:
+ case nir_texop_lod:
/* These operations require a sampler */
p->src = nir_src_for_ssa(&sampler->dest.ssa);
p->src_type = nir_tex_src_sampler_deref;
@@ -2163,7 +2166,6 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
case nir_texop_txf:
case nir_texop_txf_ms:
case nir_texop_txs:
- case nir_texop_lod:
case nir_texop_query_levels:
case nir_texop_texture_samples:
case nir_texop_samples_identical:
@@ -3045,12 +3047,7 @@ nir_ssa_def *
vtn_vector_extract_dynamic(struct vtn_builder *b, nir_ssa_def *src,
nir_ssa_def *index)
{
- nir_ssa_def *dest = vtn_vector_extract(b, src, 0);
- for (unsigned i = 1; i < src->num_components; i++)
- dest = nir_bcsel(&b->nb, nir_ieq_imm(&b->nb, index, i),
- vtn_vector_extract(b, src, i), dest);
-
- return dest;
+ return nir_vector_extract(&b->nb, src, nir_i2i(&b->nb, index, 32));
}
nir_ssa_def *
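
What the removed loop computed, conceptually (illustration only): the builder helper now produces the same selection.

/* The old bcsel chain, for a vec3 src:
 *
 *    dest = src.x;
 *    dest = (index == 1) ? src.y : dest;
 *    dest = (index == 2) ? src.z : dest;
 *
 * nir_vector_extract() builds an equivalent selection (and returns the
 * channel directly for a constant index), centralizing the logic in
 * nir_builder instead of open-coding it here.
 */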
@@ -3595,6 +3592,7 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvCapabilityInt64Atomics:
spv_check_supported(int64_atomics, cap);
+ break;
case SpvCapabilityInt8:
spv_check_supported(int8, cap);
@@ -3703,12 +3701,26 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
spv_check_supported(storage_8bit, cap);
break;
+ case SpvCapabilityShaderNonUniformEXT:
+ spv_check_supported(descriptor_indexing, cap);
+ break;
+
case SpvCapabilityInputAttachmentArrayDynamicIndexingEXT:
case SpvCapabilityUniformTexelBufferArrayDynamicIndexingEXT:
case SpvCapabilityStorageTexelBufferArrayDynamicIndexingEXT:
spv_check_supported(descriptor_array_dynamic_indexing, cap);
break;
+ case SpvCapabilityUniformBufferArrayNonUniformIndexingEXT:
+ case SpvCapabilitySampledImageArrayNonUniformIndexingEXT:
+ case SpvCapabilityStorageBufferArrayNonUniformIndexingEXT:
+ case SpvCapabilityStorageImageArrayNonUniformIndexingEXT:
+ case SpvCapabilityInputAttachmentArrayNonUniformIndexingEXT:
+ case SpvCapabilityUniformTexelBufferArrayNonUniformIndexingEXT:
+ case SpvCapabilityStorageTexelBufferArrayNonUniformIndexingEXT:
+ spv_check_supported(descriptor_array_non_uniform_indexing, cap);
+ break;
+
case SpvCapabilityRuntimeDescriptorArrayEXT:
spv_check_supported(runtime_descriptor_array, cap);
break;
@@ -3764,6 +3776,7 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvOpExecutionMode:
case SpvOpDecorationGroup:
case SpvOpDecorate:
+ case SpvOpDecorateId:
case SpvOpMemberDecorate:
case SpvOpGroupDecorate:
case SpvOpGroupMemberDecorate:
@@ -3951,6 +3964,7 @@ vtn_handle_variable_or_type_instruction(struct vtn_builder *b, SpvOp opcode,
case SpvOpMemberName:
case SpvOpDecorationGroup:
case SpvOpDecorate:
+ case SpvOpDecorateId:
case SpvOpMemberDecorate:
case SpvOpGroupDecorate:
case SpvOpGroupMemberDecorate:
@@ -4480,20 +4494,35 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
}
} while (progress);
+ vtn_assert(b->entry_point->value_type == vtn_value_type_function);
+ nir_function *entry_point = b->entry_point->func->impl->function;
+ vtn_assert(entry_point);
+
+ entry_point->is_entrypoint = true;
+
+ /* When multiple shader stages exist in the same SPIR-V module, we
+ * generate input and output variables for every stage, in the same
+ * NIR program. These dead variables can be invalid NIR. For example,
+ * TCS outputs must be per-vertex arrays (or decorated 'patch'), while
+ * VS output variables wouldn't be.
+ *
+ * To ensure we have valid NIR, we eliminate any dead inputs and outputs
+ * right away. In order to do so, we must lower any constant initializers
+ * on outputs so nir_remove_dead_variables sees that they're written to.
+ */
+ nir_lower_constant_initializers(b->shader, nir_var_shader_out);
+ nir_remove_dead_variables(b->shader,
+ nir_var_shader_in | nir_var_shader_out);
+
/* We sometimes generate bogus derefs that, while never used, give the
* validator a bit of heartburn. Run dead code to get rid of them.
*/
nir_opt_dce(b->shader);
- vtn_assert(b->entry_point->value_type == vtn_value_type_function);
- nir_function *entry_point = b->entry_point->func->impl->function;
- vtn_assert(entry_point);
-
/* Unparent the shader from the vtn_builder before we delete the builder */
ralloc_steal(NULL, b->shader);
ralloc_free(b);
- entry_point->is_entrypoint = true;
return entry_point;
}
diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c
index ecdfd0c735f..fe5340ab8cf 100644
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -1444,6 +1444,8 @@ apply_var_decoration(struct vtn_builder *b,
switch (builtin) {
case SpvBuiltInTessLevelOuter:
case SpvBuiltInTessLevelInner:
+ case SpvBuiltInClipDistance:
+ case SpvBuiltInCullDistance:
var_data->compact = true;
break;
case SpvBuiltInFragCoord:
@@ -2442,9 +2444,17 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
case SpvOpArrayLength: {
struct vtn_pointer *ptr =
vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
+ const uint32_t field = w[4];
- const uint32_t offset = ptr->var->type->offsets[w[4]];
- const uint32_t stride = ptr->var->type->members[w[4]]->stride;
+ vtn_fail_if(ptr->type->base_type != vtn_base_type_struct,
+ "OpArrayLength must take a pointer to a structure type");
+ vtn_fail_if(field != ptr->type->length - 1 ||
+ ptr->type->members[field]->base_type != vtn_base_type_array,
+ "OpArrayLength must reference the last memeber of the "
+ "structure and that must be an array");
+
+ const uint32_t offset = ptr->type->offsets[field];
+ const uint32_t stride = ptr->type->members[field]->stride;
if (!ptr->block_index) {
struct vtn_access_chain chain = {
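
The shape the new checks above enforce, expressed as the equivalent C layout (illustration only; names invented):

/* Equivalent C view of the structure OpArrayLength expects: the queried
 * member must be the last one, and it must be an unsized (runtime) array. */
#include <stdint.h>

struct ssbo_example {
   uint32_t count;   /* type->offsets[0] */
   float    data[];  /* field == ptr->type->length - 1; its length is
                      * typically derived from the buffer size together
                      * with offsets[field] and members[field]->stride */
};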
diff --git a/src/egl/Android.mk b/src/egl/Android.mk
index 42b391e6d86..3c7f1366e34 100644
--- a/src/egl/Android.mk
+++ b/src/egl/Android.mk
@@ -59,11 +59,22 @@ LOCAL_SHARED_LIBRARIES := \
libcutils \
libsync
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+LOCAL_C_INCLUDES += \
+ frameworks/native/libs/nativewindow/include \
+ frameworks/native/libs/arect/include
+LOCAL_HEADER_LIBRARIES += libnativebase_headers
+endif
+
ifeq ($(BOARD_USES_DRM_GRALLOC),true)
LOCAL_CFLAGS += -DHAVE_DRM_GRALLOC
LOCAL_SHARED_LIBRARIES += libgralloc_drm
endif
+ifeq ($(strip $(BOARD_USES_GRALLOC1)),true)
+LOCAL_CFLAGS += -DHAVE_GRALLOC1
+endif
+
ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),)
LOCAL_SHARED_LIBRARIES += libnativewindow
endif
@@ -81,6 +92,6 @@ endif
LOCAL_MODULE := libGLES_mesa
LOCAL_MODULE_RELATIVE_PATH := egl
-
+LOCAL_CFLAGS += -Wno-error
include $(MESA_COMMON_MK)
include $(BUILD_SHARED_LIBRARY)
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index c98b9a5d18a..7f01c3ed8aa 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -66,6 +66,20 @@
#include "util/u_vector.h"
#include "mapi/glapi/glapi.h"
+/* Additional definitions not yet in the drm_fourcc.h.
+ */
+#ifndef DRM_FORMAT_P010
+#define DRM_FORMAT_P010 fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cb:Cr plane 10 bits per channel */
+#endif
+
+#ifndef DRM_FORMAT_P012
+#define DRM_FORMAT_P012 fourcc_code('P', '0', '1', '2') /* 2x2 subsampled Cb:Cr plane 12 bits per channel */
+#endif
+
+#ifndef DRM_FORMAT_P016
+#define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cb:Cr plane 16 bits per channel */
+#endif
+
#define NUM_ATTRIBS 12
static void
@@ -199,8 +213,10 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
bind_to_texture_rgb = 0;
bind_to_texture_rgba = 0;
- for (int i = 0; dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib,
- &value); ++i) {
+ for (int i = 0; i < __DRI_ATTRIB_MAX; ++i) {
+ if (!dri2_dpy->core->indexConfigAttrib(dri_config, i, &attrib, &value))
+ break;
+
switch (attrib) {
case __DRI_ATTRIB_RENDER_TYPE:
if (value & __DRI_ATTRIB_RGBA_BIT)
@@ -1431,6 +1447,37 @@ dri2_surf_update_fence_fd(_EGLContext *ctx,
dri2_surface_set_out_fence_fd(surf, fence_fd);
}
+EGLBoolean
+dri2_create_drawable(struct dri2_egl_display *dri2_dpy,
+ const __DRIconfig *config,
+ struct dri2_egl_surface *dri2_surf)
+{
+ __DRIcreateNewDrawableFunc createNewDrawable;
+ void *loaderPrivate = dri2_surf;
+
+ if (dri2_dpy->image_driver)
+ createNewDrawable = dri2_dpy->image_driver->createNewDrawable;
+ else if (dri2_dpy->dri2)
+ createNewDrawable = dri2_dpy->dri2->createNewDrawable;
+ else if (dri2_dpy->swrast)
+ createNewDrawable = dri2_dpy->swrast->createNewDrawable;
+ else
+ return _eglError(EGL_BAD_ALLOC, "no createNewDrawable");
+
+ /* As always, gbm is a bit special... */
+#ifdef HAVE_DRM_PLATFORM
+ if (dri2_surf->gbm_surf)
+ loaderPrivate = dri2_surf->gbm_surf;
+#endif
+
+ dri2_surf->dri_drawable = (*createNewDrawable)(dri2_dpy->dri_screen,
+ config, loaderPrivate);
+ if (dri2_surf->dri_drawable == NULL)
+ return _eglError(EGL_BAD_ALLOC, "createNewDrawable");
+
+ return EGL_TRUE;
+}
+
/**
* Called via eglMakeCurrent(), drv->API.MakeCurrent().
*/
@@ -2262,6 +2309,9 @@ dri2_num_fourcc_format_planes(EGLint format)
case DRM_FORMAT_NV21:
case DRM_FORMAT_NV16:
case DRM_FORMAT_NV61:
+ case DRM_FORMAT_P010:
+ case DRM_FORMAT_P012:
+ case DRM_FORMAT_P016:
return 2;
case DRM_FORMAT_YUV410:
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index a9ddadf11b1..06bf60f60db 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -69,6 +69,10 @@ struct zwp_linux_dmabuf_v1;
#include <system/window.h>
#endif /* HAVE_ANDROID_PLATFORM */
+#ifdef HAVE_GRALLOC1
+#include <hardware/gralloc1.h>
+#endif
+
#include "eglconfig.h"
#include "eglcontext.h"
#include "egldevice.h"
@@ -238,7 +242,14 @@ struct dri2_egl_display
#endif
#ifdef HAVE_ANDROID_PLATFORM
- const gralloc_module_t *gralloc;
+ const hw_module_t *gralloc;
+ uint16_t gralloc_version;
+#ifdef HAVE_GRALLOC1
+ gralloc1_device_t *gralloc1_dvc;
+ GRALLOC1_PFN_LOCK_FLEX pfn_lockflex;
+ GRALLOC1_PFN_GET_FORMAT pfn_getFormat;
+ GRALLOC1_PFN_UNLOCK pfn_unlock;
+#endif
#endif
bool is_render_node;
@@ -540,6 +551,11 @@ dri2_init_surface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
void
dri2_fini_surface(_EGLSurface *surf);
+EGLBoolean
+dri2_create_drawable(struct dri2_egl_display *dri2_dpy,
+ const __DRIconfig *config,
+ struct dri2_egl_surface *dri2_surf);
+
static inline uint64_t
combine_u32_into_u64(uint32_t hi, uint32_t lo)
{
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 366a9ec14e9..a8ed230769e 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -49,6 +49,8 @@
#define ALIGN(val, align) (((val) + (align) - 1) & ~((align) - 1))
+#define GRALLOC_DRM_GET_FORMAT 1
+
struct droid_yuv_format {
/* Lookup keys */
int native; /* HAL_PIXEL_FORMAT_ */
@@ -59,14 +61,26 @@ struct droid_yuv_format {
int fourcc; /* __DRI_IMAGE_FOURCC_ */
};
+/* This enumeration can be deleted once Android defines these formats in
+ * system/core/include/system/graphics.h
+ */
+enum {
+ HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL = 0x100,
+ HAL_PIXEL_FORMAT_NV12 = 0x10F,
+ HAL_PIXEL_FORMAT_P010_INTEL = 0x110
+};
+
/* The following table is used to look up a DRI image FourCC based
* on native format and information contained in android_ycbcr struct. */
static const struct droid_yuv_format droid_yuv_formats[] = {
/* Native format, YCrCb, Chroma step, DRI image FourCC */
{ HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 2, __DRI_IMAGE_FOURCC_NV12 },
+ { HAL_PIXEL_FORMAT_P010_INTEL, 0, 4, __DRI_IMAGE_FOURCC_P010 },
{ HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 1, __DRI_IMAGE_FOURCC_YUV420 },
{ HAL_PIXEL_FORMAT_YCbCr_420_888, 1, 1, __DRI_IMAGE_FOURCC_YVU420 },
{ HAL_PIXEL_FORMAT_YV12, 1, 1, __DRI_IMAGE_FOURCC_YVU420 },
+ { HAL_PIXEL_FORMAT_NV12, 0, 2, __DRI_IMAGE_FOURCC_NV12 },
+ { HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL, 0, 2, __DRI_IMAGE_FOURCC_NV12 },
/* HACK: See droid_create_image_from_prime_fd() and
* https://issuetracker.google.com/32077885. */
{ HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 0, 2, __DRI_IMAGE_FOURCC_NV12 },
@@ -249,6 +263,51 @@ droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf)
return EGL_TRUE;
}
+static int
+droid_resolve_format(struct dri2_egl_display *dri2_dpy,
+ struct ANativeWindowBuffer *buf)
+{
+ int format = -1;
+ int ret;
+
+ if (buf->format != HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED)
+ return buf->format;
+#ifdef HAVE_GRALLOC1
+ if (dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) {
+ if (!dri2_dpy->pfn_getFormat) {
+ _eglLog(_EGL_WARNING, "Gralloc does not support getFormat");
+ return -1;
+ }
+ ret = dri2_dpy->pfn_getFormat(dri2_dpy->gralloc1_dvc, buf->handle,
+ &format);
+ if (ret) {
+ _eglLog(_EGL_WARNING, "gralloc->getFormat failed: %d", ret);
+ return -1;
+ }
+ } else {
+#endif
+ const gralloc_module_t *gralloc0;
+ gralloc0 = dri2_dpy->gralloc;
+
+ if (!gralloc0->perform) {
+ _eglLog(_EGL_WARNING, "gralloc->perform not supported");
+ return -1;
+ }
+ ret = gralloc0->perform(dri2_dpy->gralloc,
+ GRALLOC_DRM_GET_FORMAT,
+ buf->handle, &format);
+ if (ret) {
+ _eglLog(_EGL_WARNING, "gralloc->perform failed with error: %d", ret);
+ return -1;
+ }
+#ifdef HAVE_GRALLOC1
+ }
+#endif
+ return format;
+}
+
static EGLBoolean
droid_window_enqueue_buffer(_EGLDisplay *disp, struct dri2_egl_surface *dri2_surf)
{
@@ -335,7 +394,6 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
_EGLConfig *conf, void *native_window,
const EGLint *attrib_list)
{
- __DRIcreateNewDrawableFunc createNewDrawable;
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
struct dri2_egl_surface *dri2_surf;
@@ -379,17 +437,8 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
goto cleanup_surface;
}
- if (dri2_dpy->image_driver)
- createNewDrawable = dri2_dpy->image_driver->createNewDrawable;
- else
- createNewDrawable = dri2_dpy->dri2->createNewDrawable;
-
- dri2_surf->dri_drawable = (*createNewDrawable)(dri2_dpy->dri_screen, config,
- dri2_surf);
- if (dri2_surf->dri_drawable == NULL) {
- _eglError(EGL_BAD_ALLOC, "createNewDrawable");
+ if (!dri2_create_drawable(dri2_dpy, config, dri2_surf))
goto cleanup_surface;
- }
if (window) {
window->common.incRef(&window->common);
@@ -463,7 +512,7 @@ droid_swap_interval(_EGLDriver *drv, _EGLDisplay *dpy,
struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
struct ANativeWindow *window = dri2_surf->window;
- if (window->setSwapInterval(window, interval))
+ if (window && window->setSwapInterval(window, interval))
return EGL_FALSE;
surf->SwapInterval = interval;
@@ -664,11 +713,18 @@ droid_query_buffer_age(_EGLDriver *drv,
{
struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surface);
+ /* To avoid blocking other EGL calls, release the display mutex before
+ * we enter droid_window_dequeue_buffer() and re-acquire the mutex upon
+ * return.
+ */
+ mtx_unlock(&disp->Mutex);
if (update_buffers(dri2_surf) < 0) {
_eglError(EGL_BAD_ALLOC, "droid_query_buffer_age");
+ mtx_lock(&disp->Mutex);
return -1;
}
+ mtx_lock(&disp->Mutex);
return dri2_surf->back ? dri2_surf->back->age : 0;
}
@@ -731,6 +787,31 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
return EGL_TRUE;
}
+static int
+get_ycbcr_from_flexlayout(struct android_flex_layout *outFlexLayout,
+                          struct android_ycbcr *ycbcr)
+{
+   for (int i = 0; i < outFlexLayout->num_planes; i++) {
+      switch (outFlexLayout->planes[i].component) {
+      case FLEX_COMPONENT_Y:
+         ycbcr->y = outFlexLayout->planes[i].top_left;
+         ycbcr->ystride = outFlexLayout->planes[i].v_increment;
+         break;
+      case FLEX_COMPONENT_Cb:
+         ycbcr->cb = outFlexLayout->planes[i].top_left;
+         ycbcr->cstride = outFlexLayout->planes[i].v_increment;
+         break;
+      case FLEX_COMPONENT_Cr:
+         ycbcr->cr = outFlexLayout->planes[i].top_left;
+         ycbcr->chroma_step = outFlexLayout->planes[i].h_increment;
+         break;
+      default:
+         _eglLog(_EGL_WARNING, "%s: unknown component 0x%x", __func__,
+                 outFlexLayout->planes[i].component);
+         break;
+      }
+   }
+   return 0;
+}
+
#if ANDROID_API_LEVEL >= 23
static EGLBoolean
droid_set_damage_region(_EGLDriver *drv,
@@ -774,30 +855,70 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx,
{
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
struct android_ycbcr ycbcr;
+#ifdef HAVE_GRALLOC1
+ struct android_flex_layout outFlexLayout;
+ gralloc1_rect_t accessRegion;
+#endif
size_t offsets[3];
size_t pitches[3];
int is_ycrcb;
int fourcc;
int ret;
- if (!dri2_dpy->gralloc->lock_ycbcr) {
- _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr");
+ int format = droid_resolve_format(dri2_dpy, buf);
+ if (format < 0) {
+ _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR");
return NULL;
}
memset(&ycbcr, 0, sizeof(ycbcr));
- ret = dri2_dpy->gralloc->lock_ycbcr(dri2_dpy->gralloc, buf->handle,
- 0, 0, 0, 0, 0, &ycbcr);
- if (ret) {
- /* HACK: See droid_create_image_from_prime_fd() and
- * https://issuetracker.google.com/32077885.*/
- if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED)
- return NULL;
-
- _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret);
- return NULL;
- }
- dri2_dpy->gralloc->unlock(dri2_dpy->gralloc, buf->handle);
+#ifdef HAVE_GRALLOC1
+ if (dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) {
+ if (!dri2_dpy->pfn_lockflex) {
+ _eglLog(_EGL_WARNING, "Gralloc does not support lockflex");
+ return NULL;
+ }
+
+ ret = dri2_dpy->pfn_lockflex(dri2_dpy->gralloc1_dvc, buf->handle,
+ 0, 0, &accessRegion, &outFlexLayout, -1);
+ if (ret) {
+ _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret);
+ return NULL;
+ }
+ ret = get_ycbcr_from_flexlayout(&outFlexLayout, &ycbcr);
+ if (ret) {
+ _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret);
+ return NULL;
+ }
+ int outReleaseFence = 0;
+ dri2_dpy->pfn_unlock(dri2_dpy->gralloc1_dvc, buf->handle, &outReleaseFence);
+ } else {
+#endif
+ const gralloc_module_t *gralloc0;
+ gralloc0 = dri2_dpy->gralloc;
+
+ if (!gralloc0->lock_ycbcr) {
+ _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr");
+ return NULL;
+ }
+
+ ret = gralloc0->lock_ycbcr(gralloc0, buf->handle,
+ 0, 0, 0, 0, 0, &ycbcr);
+
+ if (ret) {
+ /* HACK: See droid_create_image_from_prime_fd() and
+ * https://issuetracker.google.com/32077885.*/
+ if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED)
+ return NULL;
+
+ _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret);
+ return NULL;
+ }
+
+ gralloc0->unlock(dri2_dpy->gralloc, buf->handle);
+#ifdef HAVE_GRALLOC1
+ }
+#endif
/* When lock_ycbcr's usage argument contains no SW_READ/WRITE flags
* it will return the .y/.cb/.cr pointers based on a NULL pointer,
@@ -822,14 +943,15 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx,
/* .chroma_step is the byte distance between the same chroma channel
* values of subsequent pixels, assumed to be the same for Cb and Cr. */
- fourcc = get_fourcc_yuv(buf->format, is_ycrcb, ycbcr.chroma_step);
+ fourcc = get_fourcc_yuv(format, is_ycrcb, ycbcr.chroma_step);
if (fourcc == -1) {
_eglLog(_EGL_WARNING, "unsupported YUV format, native = %x, is_ycrcb = %d, chroma_step = %d",
- buf->format, is_ycrcb, ycbcr.chroma_step);
+ format, is_ycrcb, ycbcr.chroma_step);
return NULL;
}
- if (ycbcr.chroma_step == 2) {
+ /* FIXME? we should not rely on chroma_step */
+ if (ycbcr.chroma_step == 2 || ycbcr.chroma_step == 4) {
/* Semi-planar Y + CbCr or Y + CrCb format. */
const EGLint attr_list_2plane[] = {
EGL_WIDTH, buf->width,
@@ -871,9 +993,16 @@ static _EGLImage *
droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx,
struct ANativeWindowBuffer *buf, int fd)
{
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
unsigned int pitch;
- if (is_yuv(buf->format)) {
+ int format = droid_resolve_format(dri2_dpy, buf);
+ if (format < 0) {
+ _eglLog(_EGL_WARNING, "Could not resolve buffer format");
+ return NULL;
+ }
+
+ if (is_yuv(format)) {
_EGLImage *image;
image = droid_create_image_from_prime_fd_yuv(disp, ctx, buf, fd);
@@ -888,13 +1017,13 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx,
return image;
}
- const int fourcc = get_fourcc(buf->format);
+ const int fourcc = get_fourcc(format);
if (fourcc == -1) {
_eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR");
return NULL;
}
- pitch = buf->stride * get_format_bpp(buf->format);
+ pitch = buf->stride * get_format_bpp(format);
if (pitch == 0) {
_eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR");
return NULL;
@@ -1530,6 +1659,7 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp)
_EGLDevice *dev;
struct dri2_egl_display *dri2_dpy;
const char *err;
+ hw_device_t *device;
int ret;
/* Not supported yet */
@@ -1547,6 +1677,27 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp)
err = "DRI2: failed to get gralloc module";
goto cleanup;
}
+ dri2_dpy->gralloc_version = dri2_dpy->gralloc->module_api_version;
+#ifdef HAVE_GRALLOC1
+ if (dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) {
+ ret = dri2_dpy->gralloc->methods->open(dri2_dpy->gralloc, GRALLOC_HARDWARE_MODULE_ID, &device);
+ if (ret) {
+ err = "Failed to open hw_device device";
+ goto cleanup;
+ } else {
+ dri2_dpy->gralloc1_dvc = (gralloc1_device_t *)device;
+
+ dri2_dpy->pfn_lockflex = (GRALLOC1_PFN_LOCK_FLEX)\
+ dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_LOCK_FLEX);
+
+ dri2_dpy->pfn_getFormat = (GRALLOC1_PFN_GET_FORMAT)\
+ dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_GET_FORMAT);
+
+ dri2_dpy->pfn_unlock = (GRALLOC1_PFN_UNLOCK)\
+ dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_UNLOCK);
+ }
+ }
+#endif
disp->DriverData = (void *) dri2_dpy;
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index c1ab1c9b0f6..ec66ac3866e 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -171,23 +171,8 @@ dri2_drm_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
dri2_surf->base.Height = surf->base.height;
surf->dri_private = dri2_surf;
- if (dri2_dpy->dri2) {
- dri2_surf->dri_drawable =
- dri2_dpy->dri2->createNewDrawable(dri2_dpy->dri_screen, config,
- dri2_surf->gbm_surf);
-
- } else {
- assert(dri2_dpy->swrast != NULL);
-
- dri2_surf->dri_drawable =
- dri2_dpy->swrast->createNewDrawable(dri2_dpy->dri_screen, config,
- dri2_surf->gbm_surf);
-
- }
- if (dri2_surf->dri_drawable == NULL) {
- _eglError(EGL_BAD_ALLOC, "createNewDrawable()");
+ if (!dri2_create_drawable(dri2_dpy, config, dri2_surf))
goto cleanup_surf;
- }
return &dri2_surf->base;
diff --git a/src/egl/drivers/dri2/platform_surfaceless.c b/src/egl/drivers/dri2/platform_surfaceless.c
index f9809561611..a13f3805478 100644
--- a/src/egl/drivers/dri2/platform_surfaceless.c
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -135,13 +135,8 @@ dri2_surfaceless_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
goto cleanup_surface;
}
- dri2_surf->dri_drawable =
- dri2_dpy->image_driver->createNewDrawable(dri2_dpy->dri_screen, config,
- dri2_surf);
- if (dri2_surf->dri_drawable == NULL) {
- _eglError(EGL_BAD_ALLOC, "image->createNewDrawable");
+ if (!dri2_create_drawable(dri2_dpy, config, dri2_surf))
goto cleanup_surface;
- }
if (conf->RedSize == 5)
dri2_surf->visual = __DRI_IMAGE_FORMAT_RGB565;
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index c3ca1b6f7bc..e647ed63192 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -272,7 +272,6 @@ dri2_wl_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
_EGLConfig *conf, void *native_window,
const EGLint *attrib_list)
{
- __DRIcreateNewDrawableFunc createNewDrawable;
struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
struct wl_egl_window *window = native_window;
@@ -349,19 +348,8 @@ dri2_wl_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
if (dri2_dpy->flush)
dri2_surf->wl_win->resize_callback = resize_callback;
- if (dri2_dpy->image_driver)
- createNewDrawable = dri2_dpy->image_driver->createNewDrawable;
- else if (dri2_dpy->dri2)
- createNewDrawable = dri2_dpy->dri2->createNewDrawable;
- else
- createNewDrawable = dri2_dpy->swrast->createNewDrawable;
-
- dri2_surf->dri_drawable = (*createNewDrawable)(dri2_dpy->dri_screen, config,
- dri2_surf);
- if (dri2_surf->dri_drawable == NULL) {
- _eglError(EGL_BAD_ALLOC, "createNewDrawable");
+ if (!dri2_create_drawable(dri2_dpy, config, dri2_surf))
goto cleanup_surf_wrapper;
- }
dri2_surf->base.SwapInterval = dri2_dpy->default_swap_interval;
@@ -637,10 +625,8 @@ update_buffers(struct dri2_egl_surface *dri2_surf)
struct dri2_egl_display *dri2_dpy =
dri2_egl_display(dri2_surf->base.Resource.Display);
- if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
- dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {
-
- dri2_wl_release_buffers(dri2_surf);
+ if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
+ dri2_surf->base.Height != dri2_surf->wl_win->height) {
dri2_surf->base.Width = dri2_surf->wl_win->width;
dri2_surf->base.Height = dri2_surf->wl_win->height;
@@ -648,6 +634,11 @@ update_buffers(struct dri2_egl_surface *dri2_surf)
dri2_surf->dy = dri2_surf->wl_win->dy;
}
+ if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
+ dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {
+ dri2_wl_release_buffers(dri2_surf);
+ }
+
if (get_back_bo(dri2_surf) < 0) {
_eglError(EGL_BAD_ALLOC, "failed to allocate color buffer");
return -1;
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 4684c9f0825..aa1e6cfc498 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -289,21 +289,8 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
goto cleanup_pixmap;
}
- if (dri2_dpy->dri2) {
- dri2_surf->dri_drawable =
- dri2_dpy->dri2->createNewDrawable(dri2_dpy->dri_screen, config,
- dri2_surf);
- } else {
- assert(dri2_dpy->swrast);
- dri2_surf->dri_drawable =
- dri2_dpy->swrast->createNewDrawable(dri2_dpy->dri_screen, config,
- dri2_surf);
- }
-
- if (dri2_surf->dri_drawable == NULL) {
- _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
+ if (!dri2_create_drawable(dri2_dpy, config, dri2_surf))
goto cleanup_pixmap;
- }
if (type != EGL_PBUFFER_BIT) {
cookie = xcb_get_geometry (dri2_dpy->conn, dri2_surf->drawable);
diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c
index 6b241a524ec..318a85a4f88 100644
--- a/src/egl/main/eglcontext.c
+++ b/src/egl/main/eglcontext.c
@@ -178,9 +178,12 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy,
* is supported for OpenGL contexts, and requesting a
* forward-compatible context for OpenGL versions less than 3.0
* will generate an error."
+ *
+ * Note: since the forward-compatible flag can be set more than one way,
+ * the OpenGL version check is performed once, below.
*/
if ((val & EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR) &&
- (api != EGL_OPENGL_API || ctx->ClientMajorVersion < 3)) {
+ api != EGL_OPENGL_API) {
err = EGL_BAD_ATTRIBUTE;
break;
}
diff --git a/src/egl/main/eglcurrent.c b/src/egl/main/eglcurrent.c
index 479f231fb8f..d20ec64e654 100644
--- a/src/egl/main/eglcurrent.c
+++ b/src/egl/main/eglcurrent.c
@@ -137,13 +137,37 @@ _eglDestroyThreadInfo(_EGLThreadInfo *t)
}
+/**
+ * TLS destructor callback: clean up the calling thread's _EGLThreadInfo.
+ */
+static void
+_eglDestroyThreadInfoCallback(_EGLThreadInfo *t)
+{
+ /* If this callback is called on thread termination then try to also give a
+ * chance to cleanup to the client drivers. If called for module termination
+ * then just release the thread information as calling eglReleaseThread
+ * would result in a deadlock.
+ */
+ if (_egl_TSDInitialized) {
+ /* The callback handler has replaced the TLS entry, which is passed in as
+ * 't', with NULL. Restore it here so that the release thread finds it in
+ * the TLS entry.
+ */
+ _eglSetTSD(t);
+ eglReleaseThread();
+ } else {
+ _eglDestroyThreadInfo(t);
+ }
+}
+
+
/**
* Make sure TSD is initialized and return current value.
*/
static inline _EGLThreadInfo *
_eglCheckedGetTSD(void)
{
- if (_eglInitTSD(&_eglDestroyThreadInfo) != EGL_TRUE) {
+ if (_eglInitTSD(&_eglDestroyThreadInfoCallback) != EGL_TRUE) {
_eglLog(_EGL_FATAL, "failed to initialize \"current\" system");
return NULL;
}
diff --git a/src/egl/main/egldevice.c b/src/egl/main/egldevice.c
index 4878039be0e..c5c9a21273a 100644
--- a/src/egl/main/egldevice.c
+++ b/src/egl/main/egldevice.c
@@ -202,18 +202,6 @@ _eglDeviceSupports(_EGLDevice *dev, _EGLDeviceExtension ext)
};
}
-/* Ideally we'll have an extension which passes the render node,
- * instead of the card one + magic.
- *
- * Then we can move this in _eglQueryDeviceStringEXT below. Until then
- * keep it separate.
- */
-const char *
-_eglGetDRMDeviceRenderNode(_EGLDevice *dev)
-{
- return dev->device->nodes[DRM_NODE_RENDER];
-}
-
EGLBoolean
_eglQueryDeviceAttribEXT(_EGLDevice *dev, EGLint attribute,
EGLAttrib *value)
diff --git a/src/egl/main/egldevice.h b/src/egl/main/egldevice.h
index 83a47d5eacc..883f96f8e30 100644
--- a/src/egl/main/egldevice.h
+++ b/src/egl/main/egldevice.h
@@ -68,9 +68,6 @@ typedef enum _egl_device_extension _EGLDeviceExtension;
EGLBoolean
_eglDeviceSupports(_EGLDevice *dev, _EGLDeviceExtension ext);
-const char *
-_eglGetDRMDeviceRenderNode(_EGLDevice *dev);
-
EGLBoolean
_eglQueryDeviceAttribEXT(_EGLDevice *dev, EGLint attribute,
EGLAttrib *value);
diff --git a/src/egl/meson.build b/src/egl/meson.build
index a23cc36fc2b..b7ff09e9fed 100644
--- a/src/egl/meson.build
+++ b/src/egl/meson.build
@@ -93,10 +93,11 @@ if with_dri2
'drivers/dri2/egl_dri2.h',
'drivers/dri2/egl_dri2_fallbacks.h',
)
+ link_for_egl += [libloader, libxmlconfig]
+ incs_for_egl += inc_loader
if with_platform_x11
files_egl += files('drivers/dri2/platform_x11.c')
- incs_for_egl += inc_loader
if with_dri3
files_egl += files('drivers/dri2/platform_x11_dri3.c')
link_for_egl += libloader_dri3_helper
@@ -105,13 +106,12 @@ if with_dri2
endif
if with_platform_drm
files_egl += files('drivers/dri2/platform_drm.c')
- link_for_egl += [libloader, libgbm, libxmlconfig]
- incs_for_egl += [inc_loader, inc_gbm, include_directories('../gbm/main')]
+ link_for_egl += libgbm
+ incs_for_egl += [inc_gbm, include_directories('../gbm/main')]
deps_for_egl += dep_libdrm
endif
if with_platform_surfaceless
files_egl += files('drivers/dri2/platform_surfaceless.c')
- incs_for_egl += [inc_loader]
endif
if with_platform_wayland
deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers]
@@ -127,7 +127,6 @@ if with_dri2
if with_platform_android
deps_for_egl += dep_android
files_egl += files('drivers/dri2/platform_android.c')
- incs_for_egl += [inc_loader]
endif
elif with_platform_haiku
incs_for_egl += inc_haikugl
@@ -166,7 +165,7 @@ libegl = shared_library(
'-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_@0@'.format(egl_native_platform.to_upper()),
],
include_directories : incs_for_egl,
- link_with : [link_for_egl, libloader, libxmlconfig, libglapi, libmesa_util],
+ link_with : [link_for_egl, libglapi, libmesa_util],
link_args : [ld_args_bsymbolic, ld_args_gc_sections],
dependencies : [deps_for_egl, dep_dl, dep_libdrm, dep_clock, dep_thread],
install : true,
diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am
index 460fb87fb46..342f03d644c 100644
--- a/src/freedreno/Makefile.am
+++ b/src/freedreno/Makefile.am
@@ -45,6 +45,7 @@ TESTS =
BUILT_SOURCES =
CLEANFILES =
EXTRA_DIST = \
+ meson.build \
drm/meson.build \
ir3/ir3_nir_trig.py \
ir3/meson.build
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index fd641735620..fc882c2d2d1 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -686,8 +686,8 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
base_lo = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz));
base_hi = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
} else {
- base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, 4));
- base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, 4));
+ base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, ptrsz));
+ base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, ptrsz));
}
/* note: on 32bit gpu's base_hi is ignored and DCE'd */
@@ -1511,6 +1511,7 @@ emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
barrier->cat7.g = true;
barrier->cat7.r = true;
barrier->cat7.w = true;
+ barrier->cat7.l = true;
barrier->barrier_class = IR3_BARRIER_IMAGE_W |
IR3_BARRIER_BUFFER_W;
barrier->barrier_conflict =
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index d9fcf798b3d..68926c9553b 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s)
progress |= OPT(s, nir_opt_gcm, true);
else if (gcm == 2)
progress |= OPT(s, nir_opt_gcm, false);
- progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
+ progress |= OPT(s, nir_opt_peephole_select, 16, true);
progress |= OPT(s, nir_opt_intrinsics);
progress |= OPT(s, nir_opt_algebraic);
progress |= OPT(s, nir_opt_constant_folding);
diff --git a/src/gallium/auxiliary/Android.mk b/src/gallium/auxiliary/Android.mk
index 7618c6fcd93..fe976501451 100644
--- a/src/gallium/auxiliary/Android.mk
+++ b/src/gallium/auxiliary/Android.mk
@@ -32,8 +32,11 @@ LOCAL_SRC_FILES := \
$(C_SOURCES) \
$(NIR_SOURCES) \
$(RENDERONLY_SOURCES) \
- $(VL_STUB_SOURCES) \
- util/u_debug_stack_android.cpp
+ $(VL_STUB_SOURCES)
+
+ifeq ($(USE_LIBBACKTRACE),true)
+ LOCAL_SRC_FILES += util/u_debug_stack_android.cpp
+endif
LOCAL_C_INCLUDES := \
$(GALLIUM_TOP)/auxiliary/util \
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 3fc096789c0..f8c69585e6a 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -950,6 +950,8 @@ draw_set_mapped_so_targets(struct draw_context *draw,
{
int i;
+ draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
for (i = 0; i < num_targets; i++)
draw->so.targets[i] = targets[i];
for (i = num_targets; i < PIPE_MAX_SO_BUFFERS; i++)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index fcbdd5050fe..f307c26d4f7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -556,11 +556,11 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
llvm::SmallVector<std::string, 16> MAttrs;
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-#if HAVE_LLVM >= 0x0400
- /* llvm-3.7+ implements sys::getHostCPUFeatures for x86,
- * which allows us to enable/disable code generation based
- * on the results of cpuid.
+#if HAVE_LLVM >= 0x0400 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM))
+ /* llvm-3.3+ implements sys::getHostCPUFeatures for Arm
+ * and llvm-3.7+ for x86, which allows us to enable/disable
+ * code generation based on the results of cpuid on these
+ * architectures.
*/
llvm::StringMap<bool> features;
llvm::sys::getHostCPUFeatures(features);
@@ -570,7 +570,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
++f) {
MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str());
}
-#else
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/*
* We need to unset attributes because sometimes LLVM mistakenly assumes
* certain features are present given the processor name.
@@ -625,6 +625,12 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
MAttrs.push_back("-avx512vl");
#endif
#endif
+#if defined(PIPE_ARCH_ARM)
+ if (!util_cpu_caps.has_neon) {
+ MAttrs.push_back("-neon");
+ MAttrs.push_back("-crypto");
+ MAttrs.push_back("-vfp2");
+ }
#endif
#if defined(PIPE_ARCH_PPC)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 0f5b3d9acb7..d6af1d84471 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1108,7 +1108,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
* larger than the declared size but smaller than the buffer size.
*/
if (reg_file != TGSI_FILE_CONSTANT) {
- assert(index_limit > 0);
+ assert(index_limit >= 0);
max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
uint_bld->type, index_limit);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 64f2598a259..09eac4da95a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -220,7 +220,9 @@ tgsi_is_bindless_image_file(unsigned file)
{
return file != TGSI_FILE_IMAGE &&
file != TGSI_FILE_MEMORY &&
- file != TGSI_FILE_BUFFER;
+ file != TGSI_FILE_BUFFER &&
+ file != TGSI_FILE_CONSTBUF &&
+ file != TGSI_FILE_HW_ATOMIC;
}
#ifdef __cplusplus
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index e19fde9873d..3dc49cd0958 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -2059,7 +2059,8 @@ void util_blitter_generate_mipmap(struct blitter_context *blitter,
target = PIPE_TEXTURE_2D_ARRAY;
assert(tex->nr_samples <= 1);
- assert(!util_format_has_stencil(desc));
+ /* Disallow stencil formats without depth. */
+ assert(!util_format_has_stencil(desc) || util_format_has_depth(desc));
is_depth = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS;
diff --git a/src/gallium/auxiliary/util/u_screen.c b/src/gallium/auxiliary/util/u_screen.c
index 464d9dddc7f..66c17f17007 100644
--- a/src/gallium/auxiliary/util/u_screen.c
+++ b/src/gallium/auxiliary/util/u_screen.c
@@ -337,6 +337,9 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
return 1;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 8;
+
default:
unreachable("bad PIPE_CAP_*");
}
diff --git a/src/gallium/auxiliary/util/u_screen.h b/src/gallium/auxiliary/util/u_screen.h
index 65afb277ef3..3952a11f2ca 100644
--- a/src/gallium/auxiliary/util/u_screen.h
+++ b/src/gallium/auxiliary/util/u_screen.h
@@ -24,6 +24,14 @@
struct pipe_screen;
enum pipe_cap;
+#ifdef __cplusplus
+extern "C" {
+#endif
+
int
u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
enum pipe_cap param);
+
+#ifdef __cplusplus
+};
+#endif
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 8e3bceae18d..b596c322918 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -1524,7 +1524,8 @@ tc_buffer_do_flush_region(struct threaded_context *tc,
if (ttrans->staging) {
struct pipe_box src_box;
- u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
+ u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment +
+ (box->x - ttrans->b.box.x),
box->width, &src_box);
/* Copy the staging buffer into the original one. */
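The staging-copy fix above is pure offset arithmetic: the alignment padding has to be derived from the box that was originally mapped (ttrans->b.box.x), with the flushed subrange's displacement inside that box added separately. A standalone sketch with made-up numbers (the variable names mirror the fields above; only the arithmetic is real):

/* Sketch of the corrected staging-offset arithmetic from
 * tc_buffer_do_flush_region().  Values are hypothetical and chosen
 * only to illustrate the fix. */
#include <stdio.h>

int main(void)
{
   unsigned map_buffer_alignment = 64; /* tc->map_buffer_alignment */
   unsigned trans_offset = 256;        /* ttrans->offset */
   unsigned trans_box_x = 100;         /* ttrans->b.box.x: start of mapped range */
   unsigned flush_box_x = 132;         /* box->x: start of flushed subrange */

   /* Old (buggy) source offset: aligned against the flushed box, which
    * is wrong when it does not start where the mapped box starts. */
   unsigned old_src_x = trans_offset + flush_box_x % map_buffer_alignment;

   /* New source offset: align relative to the originally mapped box,
    * then add the distance of the flushed subrange into it. */
   unsigned new_src_x = trans_offset + trans_box_x % map_buffer_alignment +
                        (flush_box_x - trans_box_x);

   printf("old %u, new %u\n", old_src_x, new_src_x); /* old 260, new 324 */
   return 0;
}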
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index eaf492ce8b0..b927d014179 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -487,6 +487,10 @@ The integer capabilities:
* ``PIPE_CAP_DEST_SURFACE_SRGB_CONTROL``: Indicates whether the driver
supports switching the format between sRGB and linear for a surface that is
used as a destination in draw and blit calls.
+* ``PIPE_CAP_MAX_VARYINGS``: The maximum number of fragment shader
+ varyings. This will generally correspond to
+ ``PIPE_SHADER_CAP_MAX_INPUTS`` for the fragment shader, but in some
+ cases may be a smaller number.
.. _pipe_capf:
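The new cap is easiest to read as a driver-side pattern: either answer PIPE_CAP_MAX_VARYINGS explicitly, as the etnaviv/freedreno/i915/llvmpipe hunks below do, or fall through to u_pipe_screen_get_param_defaults(), which the u_screen.c hunk above teaches to return 8. A sketch with a made-up driver name, compilable inside the Mesa tree:

/* "mydrv" is hypothetical; everything not answered explicitly falls
 * through to the shared defaults. */
#include "pipe/p_defines.h"
#include "util/u_screen.h"

static int
mydrv_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
{
   switch (param) {
   case PIPE_CAP_MAX_VARYINGS:
      /* should not exceed the fragment shader's PIPE_SHADER_CAP_MAX_INPUTS */
      return 16;
   default:
      return u_pipe_screen_get_param_defaults(pscreen, param);
   }
}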
diff --git a/src/gallium/drivers/etnaviv/etnaviv_emit.c b/src/gallium/drivers/etnaviv/etnaviv_emit.c
index f788896dd8d..e178f02b7dc 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_emit.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_emit.c
@@ -592,12 +592,12 @@ etna_emit_state(struct etna_context *ctx)
static const uint32_t uniform_dirty_bits =
ETNA_DIRTY_SHADER | ETNA_DIRTY_CONSTBUF;
- if (dirty & (uniform_dirty_bits | ctx->shader.fs->uniforms_dirty_bits))
+ if (dirty & (uniform_dirty_bits | ctx->shader.vs->uniforms_dirty_bits))
etna_uniforms_write(
ctx, ctx->shader.vs, &ctx->constant_buffer[PIPE_SHADER_VERTEX],
ctx->shader_state.VS_UNIFORMS, &ctx->shader_state.vs_uniforms_size);
- if (dirty & (uniform_dirty_bits | ctx->shader.vs->uniforms_dirty_bits))
+ if (dirty & (uniform_dirty_bits | ctx->shader.fs->uniforms_dirty_bits))
etna_uniforms_write(
ctx, ctx->shader.fs, &ctx->constant_buffer[PIPE_SHADER_FRAGMENT],
ctx->shader_state.PS_UNIFORMS, &ctx->shader_state.ps_uniforms_size);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_resource.c b/src/gallium/drivers/etnaviv/etnaviv_resource.c
index 9a7ebf3064e..fdde07525f3 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_resource.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_resource.c
@@ -610,6 +610,7 @@ etna_resource_get_handle(struct pipe_screen *pscreen,
rsc = etna_resource(rsc->external);
handle->stride = rsc->levels[0].stride;
+ handle->offset = rsc->levels[0].offset;
handle->modifier = layout_to_modifier(rsc->layout);
if (handle->type == WINSYS_HANDLE_TYPE_SHARED) {
diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c
index fd320232528..35dcac1409b 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -360,6 +360,9 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
return 0;
+ case PIPE_CAP_MAX_VARYINGS:
+ return screen->specs.max_varyings;
+
case PIPE_CAP_PCI_GROUP:
case PIPE_CAP_PCI_BUS:
case PIPE_CAP_PCI_DEVICE:
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index fe409fa5f52..dbc15f40389 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -23,4 +23,6 @@ libfreedreno_la_SOURCES = \
$(a6xx_SOURCES) \
$(ir3_SOURCES)
-EXTRA_DIST = meson.build
+EXTRA_DIST = \
+ ir3/ir3_cmdline.c \
+ meson.build
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
index 772127c7478..498c1eae1d7 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -339,7 +339,6 @@ clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring,
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
OUT_RINGP(ring, patch_type, &batch->gmem_patches);
- OUT_RING(ring, 0);
OUT_PKT3(ring, CP_SET_CONSTANT, 4);
OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
index 1c073e31739..692188ebd4e 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
@@ -59,6 +59,28 @@ static uint32_t fmt2swap(enum pipe_format format)
}
}
+static bool
+use_hw_binning(struct fd_batch *batch)
+{
+ struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+
+ /* we hardcode a limit of 8 "pipes"; this limit could be increased
+ * at the cost of a slightly larger command stream, but very few
+ * cases will need more than 8
+ * gmem->num_vsc_pipes == 0 means empty batch (TODO: does it still happen?)
+ */
+ if (gmem->num_vsc_pipes > 8 || !gmem->num_vsc_pipes)
+ return false;
+
+ /* only a20x hw binning is implemented
+ * a22x is more like a3xx, but perhaps the a20x path works? (TODO)
+ */
+ if (!is_a20x(batch->ctx->screen))
+ return false;
+
+ return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2);
+}
+
/* transfer from gmem to system memory (ie. normal RAM) */
static void
@@ -272,7 +294,7 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile)
x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width);
y0 = ((float)tile->yoff) / ((float)pfb->height);
y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height);
- OUT_PKT3(ring, CP_MEM_WRITE, 9);
+ OUT_PKT3(ring, CP_MEM_WRITE, 7);
OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 36, 0, 0);
OUT_RING(ring, fui(x0));
OUT_RING(ring, fui(y0));
@@ -280,8 +302,6 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile)
OUT_RING(ring, fui(y0));
OUT_RING(ring, fui(x0));
OUT_RING(ring, fui(y1));
- OUT_RING(ring, fui(x1));
- OUT_RING(ring, fui(y1));
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
@@ -492,18 +512,18 @@ fd2_emit_tile_init(struct fd_batch *batch)
/* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */
switch (patch->val) {
case GMEM_PATCH_FASTCLEAR_COLOR:
- size = align(gmem->bin_w * gmem->bin_h * color_size, 0x4000);
+ size = align(gmem->bin_w * gmem->bin_h * color_size, 0x8000);
lines = size / 1024;
depth_base = size / 2;
break;
case GMEM_PATCH_FASTCLEAR_DEPTH:
- size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x4000);
+ size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x8000);
lines = size / 1024;
color_base = depth_base;
depth_base = depth_base + size / 2;
break;
case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH:
- lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x4000) / 1024;
+ lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x8000) / 1024;
break;
case GMEM_PATCH_RESTORE_INFO:
patch->cs[0] = gmem->bin_w;
@@ -535,7 +555,7 @@ fd2_emit_tile_init(struct fd_batch *batch)
OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX));
OUT_RING(ring, 0);
- if (is_a20x(ctx->screen) && fd_binning_enabled && gmem->num_vsc_pipes) {
+ if (use_hw_binning(batch)) {
/* patch out unneeded memory exports by changing EXEC CF to EXEC_END
*
* in the shader compiler, we guarantee that the shader ends with
@@ -694,7 +714,7 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
OUT_RING(ring, fui(0.0f));
}
- if (is_a20x(ctx->screen) && fd_binning_enabled) {
+ if (use_hw_binning(batch)) {
struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p];
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c
index 1bd1f103ccd..2c813804689 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c
@@ -55,6 +55,12 @@ fd2_setup_slices(struct fd_resource *rsc)
break;
}
+ /* mipmaps have power-of-two sizes in memory */
+ if (level) {
+ width = util_next_power_of_two(width);
+ height = util_next_power_of_two(height);
+ }
+
slice->pitch = width;
slice->offset = size;
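The slice-setup change above is easier to see as a self-contained model: the base level keeps its true size, and every other mip level is padded to power-of-two dimensions before its pitch and offset are computed. A simplified sketch, not the real fd2_setup_slices() (array layers and a2xx tiling details are omitted, and the two util helpers are reimplemented locally so the sketch runs standalone):

#include <stdio.h>

static unsigned
next_pow2(unsigned x) /* like util_next_power_of_two() */
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

static unsigned
minify(unsigned v, unsigned l) /* like u_minify() */
{
   v >>= l;
   return v ? v : 1;
}

static unsigned
setup_slices_sketch(unsigned width0, unsigned height0, unsigned cpp,
                    unsigned last_level, unsigned *pitch, unsigned *offset)
{
   unsigned size = 0;

   for (unsigned level = 0; level <= last_level; level++) {
      unsigned width = minify(width0, level);
      unsigned height = minify(height0, level);

      /* mipmaps have power-of-two sizes in memory */
      if (level) {
         width = next_pow2(width);
         height = next_pow2(height);
      }

      pitch[level] = width;
      offset[level] = size;
      size += width * height * cpp;
   }
   return size; /* total bo size */
}

int main(void)
{
   unsigned pitch[4], offset[4];
   unsigned total = setup_slices_sketch(100, 60, 4, 3, pitch, offset);

   for (int l = 0; l <= 3; l++)
      printf("level %d: pitch %u offset %u\n", l, pitch[l], offset[l]);
   printf("total %u\n", total);
   return 0;
}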
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
index 5d92f86befc..b206911270a 100644
--- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
@@ -74,7 +74,7 @@ ir2_optimize_loop(nir_shader *s)
progress |= OPT(s, nir_opt_dce);
progress |= OPT(s, nir_opt_cse);
/* progress |= OPT(s, nir_opt_gcm, true); */
- progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
+ progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true);
progress |= OPT(s, nir_opt_intrinsics);
progress |= OPT(s, nir_opt_algebraic);
progress |= OPT(s, nir_opt_constant_folding);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
index 460255f748a..c8719636182 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c
@@ -438,7 +438,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info)
OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) |
A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) |
A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap));
- OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */
+ OUT_RELOCW(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */
OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch));
OUT_RING(ring, 0x00000000);
OUT_RING(ring, 0x00000000);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
index cb5c582476f..38825d0aa4d 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
@@ -588,6 +588,13 @@ set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring)
blit_scissor.maxx = MIN2(pfb->width, batch->max_scissor.maxx);
blit_scissor.maxy = MIN2(pfb->height, batch->max_scissor.maxy);
+ /* NOTE: blob switches to CP_BLIT instead of CP_EVENT_WRITE:BLIT for
+ * small render targets. But since we align pitch to binw, I think
+ * we can avoid GPU hangs in a simpler way, by just rounding up
+ * the blit scissor:
+ */
+ blit_scissor.maxx = MAX2(blit_scissor.maxx, batch->ctx->screen->gmem_alignw);
+
OUT_PKT4(ring, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
OUT_RING(ring,
A6XX_RB_BLIT_SCISSOR_TL_X(blit_scissor.minx) |
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index ce8e4480be1..1879d2c60ed 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -853,7 +853,13 @@ fd_resource_create_with_modifiers(struct pipe_screen *pscreen,
enum pipe_format format = tmpl->format;
uint32_t size;
- if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) {
+ /* when using kmsro, scanout buffers are allocated on the display device.
+ * create_with_modifiers() doesn't give us usage flags, so we have to
+ * assume that all calls with modifiers are scanout-possible
+ */
+ if (screen->ro &&
+ ((tmpl->bind & PIPE_BIND_SCANOUT) ||
+ !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) {
struct pipe_resource scanout_templat = *tmpl;
struct renderonly_scanout *scanout;
struct winsys_handle handle;
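Restated as a standalone predicate, the new allocation rule reads: with a renderonly (kmsro) screen, go through the display device when scanout is requested outright, or when the caller passed any modifier list other than the single DRM_FORMAT_MOD_INVALID marker. A hedged sketch (the helper name is made up; DRM_FORMAT_MOD_INVALID is copied from drm_fourcc.h for self-containment):

#include <stdbool.h>
#include <stdint.h>

#define DRM_FORMAT_MOD_INVALID 0x00ffffffffffffffULL

static bool
alloc_on_display_device(bool have_ro, bool bind_scanout,
                        const uint64_t *modifiers, int count)
{
   if (!have_ro)
      return false;
   if (bind_scanout)
      return true;
   /* real modifiers present: assume the buffer may end up on screen */
   return !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID);
}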
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index e596a4e8462..c3b08ab0e0f 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -317,6 +317,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_VIEWPORTS:
return 1;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 16;
+
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
/* manage the variants for these ourself, to avoid breaking precompile: */
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index a7b4a43c015..78707c66e62 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -402,6 +402,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
return 0;
case PIPE_CAP_ENDIANNESS:
return PIPE_ENDIAN_LITTLE;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 10;
case PIPE_CAP_VENDOR_ID:
return 0x8086;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index c95016a6cbe..b55b4a3c4fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -310,6 +310,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1;
case PIPE_CAP_CLEAR_TEXTURE:
return 1;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 32;
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
index cd65b547279..576da1bab60 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm
@@ -543,6 +543,8 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
long mov b32 $r3 0x3f800000
long nop
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long nop
long ret
@@ -554,7 +556,144 @@ long ret
// SIZE: 9 * 8 bytes
//
gk104_rcp_f64:
- long nop
+ // Step 1: classify input according to exponent and value, and calculate
+ // result for 0/inf/nan. $r2 holds the exponent value, which starts at
+ // bit 52 (bit 20 of the upper half) and is 11 bits in length
+ ext u32 $r2 $r1 0xb14
+ add b32 $r3 $r2 0xffffffff
+ joinat #rcp_rejoin
+ // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
+ // denorm, or 0). Do this by subtracting 1 from the exponent, which will
+ // mean that it's > 0x7fd in those cases when doing unsigned comparison
+ set $p0 0x1 gt u32 $r3 0x7fd
+ // $r3: 0 for norms, 0x36 for denorms, -1 for others
+ long mov b32 $r3 0x0
+ sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
+ join (not $p0) nop
+ // Process all special values: NaN, inf, denorm, 0
+ mov b32 $r3 0xffffffff
+ // A number is NaN if its abs value is greater than or unordered with inf
+ set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+ (not $p0) bra #rcp_inf_or_denorm_or_zero
+ // NaN -> NaN, the next line sets the "quiet" bit of the result. This
+ // behavior is seen on both the CPU and the blob
+ join or b32 $r1 $r1 0x80000
+rcp_inf_or_denorm_or_zero:
+ and b32 $r4 $r1 0x7ff00000
+ // Other values with a nonzero exponent field should be inf
+ set $p0 0x1 eq s32 $r4 0x0
+ sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
+ $p0 bra #rcp_denorm_or_zero
+ // +/-Inf -> +/-0
+ xor b32 $r1 $r1 0x7ff00000
+ join mov b32 $r0 0x0
+rcp_denorm_or_zero:
+ set $p0 0x1 gtu f64 abs $r0d 0x0
+ $p0 bra #rcp_denorm
+ // +/-0 -> +/-Inf
+ join or b32 $r1 $r1 0x7ff00000
+rcp_denorm:
+ // non-0 denorms: multiply by 2^54 (the 0x36 in $r3), join with norms
+ mul rn f64 $r0d $r0d 0x4350000000000000
+ sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
+ join mov b32 $r3 0x36
+rcp_rejoin:
+ // All numbers with -1 in $r3 have their result ready in $r0d; return
+ // them, others need further calculation
+ set $p0 0x1 lt s32 $r3 0x0
+ $p0 bra #rcp_end
+ // Step 2: Before the real calculation goes on, renormalize the values to
+ // range [1, 2) by setting the exponent field to 0x3ff (the exponent
+ // of 1); result in $r6d. The exponent will be recovered later.
+ ext u32 $r2 $r1 0xb14
+ and b32 $r7 $r1 0x800fffff
+ add b32 $r7 $r7 0x3ff00000
+ long mov b32 $r6 $r0
+ sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
+ // Step 3: Convert new value to float (no overflow will occur due to step
+ // 2), calculate rcp and do newton-raphson step once
+ cvt rz f32 $r5 f64 $r6d
+ long rcp f32 $r4 $r5
+ mov b32 $r0 0xbf800000
+ fma rn f32 $r5 $r4 $r5 $r0
+ fma rn f32 $r0 neg $r4 $r5 $r4
+ // Step 4: convert result $r0 back to double, do newton-raphson steps
+ cvt f64 $r0d f32 $r0
+ cvt f64 $r6d neg f64 $r6d
+ sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
+ cvt f64 $r8d f32 0x3f800000
+ // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
+ // The formula used here (and above) is:
+ // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
+ // The following code uses 2 FMAs for each step, and it basically
+ // looks like:
+ // tmp = -src * RCP_{n} + 1
+ // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ // Step 5: Exponent recovery and final processing
+ // The exponent is recovered by adding back what we took off earlier.
+ // Suppose we want to calculate rcp(x) but we have rcp(cx); then
+ // rcp(x) = c * rcp(cx)
+ // The delta in exponent comes from two sources:
+ // 1) The renormalization in step 2. The delta is:
+ // 0x3ff - $r2
+ // 2) (For denorm inputs) The 2^54 we multiplied by at rcp_denorm, stored
+ // in $r3
+ // These 2 sources are calculated in the first two lines below, and then
+ // added to the exponent extracted from the result above.
+ // Note that after processing, the new exponent may be >= 0x7ff (inf)
+ // or <= 0 (denorm). Those cases are handled below, respectively
+ subr b32 $r2 $r2 0x3ff
+ long add b32 $r4 $r2 $r3
+ ext u32 $r3 $r1 0xb14
+ // New exponent in $r3
+ long add b32 $r3 $r3 $r4
+ add b32 $r2 $r3 0xffffffff
+ sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
+ // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
+ // (same logic as in step 1)
+ set $p0 0x1 lt u32 $r2 0x7fe
+ (not $p0) bra #rcp_result_inf_or_denorm
+ // Norms: convert exponents back and return
+ shl b32 $r4 $r4 clamp 0x14
+ long add b32 $r1 $r4 $r1
+ bra #rcp_end
+rcp_result_inf_or_denorm:
+ // New exponent >= 0x7ff means that result is inf
+ set $p0 0x1 ge s32 $r3 0x7ff
+ (not $p0) bra #rcp_result_denorm
+ sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
+ // Infinity
+ and b32 $r1 $r1 0x80000000
+ long mov b32 $r0 0x0
+ add b32 $r1 $r1 0x7ff00000
+ bra #rcp_end
+rcp_result_denorm:
+ // A denorm result comes from a huge input. The greatest possible fp64,
+ // i.e. 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
+ // normal value. Other rcp results should be greater than that. If we
+ // set the exponent field to 1, we can recover the result by multiplying
+ // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
+ // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
+ // the logic here.
+ set $p0 0x1 ne u32 $r3 0x0
+ and b32 $r1 $r1 0x800fffff
+ // 0x3e800000: 1/4
+ $p0 cvt f64 $r6d f32 0x3e800000
+ sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
+ // 0x3f000000: 1/2
+ (not $p0) cvt f64 $r6d f32 0x3f000000
+ add b32 $r1 $r1 0x00100000
+ mul rn f64 $r0d $r0d $r6d
+rcp_end:
long ret
// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
@@ -565,7 +704,67 @@ gk104_rcp_f64:
// SIZE: 14 * 8 bytes
//
gk104_rsq_f64:
- long nop
+ // Before getting the initial result from rsqrt64h, two special cases
+ // must be handled first.
+ // 1. NaN: set the highest bit in the mantissa so it is reliably
+ // recognized as NaN by rsqrt64h
+ set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+ $p0 or b32 $r1 $r1 0x00080000
+ and b32 $r2 $r1 0x7fffffff
+ sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
+ // 2. denorms and small normal values: using their original value would
+ // lose precision either at rsqrt64h or in the first newton-raphson
+ // step below. Take 2 as a threshold in the exponent field, and multiply
+ // by 2^54 if the exponent is smaller or equal (the result is multiplied
+ // by 2^27 at the end to recover)
+ ext u32 $r3 $r1 0xb14
+ set $p1 0x1 le u32 $r3 0x2
+ long or b32 $r2 $r0 $r2
+ $p1 mul rn f64 $r0d $r0d 0x4350000000000000
+ rsqrt64h $r5 $r1
+ // rsqrt64h will give the correct result for 0/inf/nan; the following logic
+ // checks whether the input is one of those (exponent is 0x7ff or all 0
+ // except for the sign bit)
+ set b32 $r6 ne u32 $r3 0x7ff
+ long and b32 $r2 $r2 $r6
+ sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
+ set $p0 0x1 ne u32 $r2 0x0
+ $p0 bra #rsq_norm
+ // For 0/inf/nan, make sure the sign bit agrees with input and return
+ and b32 $r1 $r1 0x80000000
+ long mov b32 $r0 0x0
+ long or b32 $r1 $r1 $r5
+ long ret
+rsq_norm:
+ // For others, do 4 Newton-Raphson steps with the formula:
+ // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+ // In the code below, each step is written as:
+ // tmp1 = 0.5 * x * RSQ_{n}
+ // tmp2 = -RSQ_{n} * tmp1 + 0.5
+ // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+ long mov b32 $r4 0x0
+ sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
+ // 0x3f000000: 1/2
+ cvt f64 $r8d f32 0x3f000000
+ mul rn f64 $r2d $r0d $r8d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ fma rn f64 $r4d $r4d $r6d $r4d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
+ fma rn f64 $r4d $r4d $r6d $r4d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ fma rn f64 $r4d $r4d $r6d $r4d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ fma rn f64 $r4d $r4d $r6d $r4d
+ sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
+ // Multiply the result by 2^27 for small inputs to recover
+ $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
+ long mov b32 $r1 $r5
+ long mov b32 $r0 $r4
long ret
//
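The rcp routine above is easier to follow against a scalar model: seed with a single-precision reciprocal, then refine with FMA pairs computing tmp = -x*r + 1 and r = r*tmp + r, which is algebraically r_{n+1} = 2*r_n - x*r_n*r_n. A C sketch of just the refinement, deliberately leaving out the NaN/inf/denorm classification and the exponent-recovery plumbing that the assembly spends most of its lines on:

#include <math.h>
#include <stdio.h>

static double
rcp_f64_model(double x)
{
   double r = (double)(1.0f / (float)x);   /* imprecise initial value */

   for (int i = 0; i < 4; i++) {
      double tmp = fma(-x, r, 1.0);        /* tmp = -src * RCP_n + 1        */
      r = fma(r, tmp, r);                  /* RCP_{n+1} = RCP_n*tmp + RCP_n */
   }
   return r;
}

int main(void)
{
   printf("%.17g\n", rcp_f64_model(3.0));  /* ~0.33333333333333331 */
   return 0;
}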
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
index 37998768efe..ed948dee471 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h
@@ -481,12 +481,132 @@ uint64_t gk104_builtin_code[] = {
0xd40040000840c785,
0x18fe00000000dde2,
0x4000000000001de4,
- 0x9000000000001de7,
-/* 0x0f08: gk104_rcp_f64 */
+ 0x2000000000000007,
0x4000000000001de4,
0x9000000000001de7,
-/* 0x0f18: gk104_rsq_f64 */
- 0x4000000000001de4,
+/* 0x0f18: gk104_rcp_f64 */
+ 0x7000c02c50109c03,
+ 0x0bfffffffc20dc02,
+ 0x6000000280000007,
+ 0x1a0ec01ff431dc03,
+ 0x180000000000dde2,
+ 0x228282f2b2d042f7,
+ 0x40000000000021f4,
+ 0x1bfffffffc00dde2,
+ 0x1e0edffc0001dc81,
+ 0x40000000200021e7,
+ 0x3800200000105c52,
+/* 0x0f70: rcp_inf_or_denorm_or_zero */
+ 0x39ffc00000111c02,
+ 0x190e0000fc41dc23,
+ 0x2202f2b2d2f042b7,
+ 0x40000000400001e7,
+ 0x39ffc00000105c82,
+ 0x1800000000001df2,
+/* 0x0fa0: rcp_denorm_or_zero */
+ 0x1e0ec0000001dc81,
+ 0x40000000200001e7,
+ 0x39ffc00000105c52,
+/* 0x0fb8: rcp_denorm */
+ 0x5000d0d400001c01,
+ 0x2280428282b282f7,
+ 0x18000000d800ddf2,
+/* 0x0fd0: rcp_rejoin */
+ 0x188e0000fc31dc23,
+ 0x40000006000001e7,
+ 0x7000c02c50109c03,
+ 0x3a003ffffc11dc02,
+ 0x08ffc0000071dc02,
+ 0x2800000000019de4,
+ 0x22e2b2a2828042b7,
+ 0x1006000019a15c04,
+ 0xc800000010511c00,
+ 0x1afe000000001de2,
+ 0x3000000014415c00,
+ 0x3008000014401e00,
+ 0x1000000001301c04,
+ 0x1000000019b19d04,
+ 0x22929292929292e7,
+ 0x1000cfe001321c04,
+ 0x2010000000611c01,
+ 0x2000000010001c01,
+ 0x2010000000611c01,
+ 0x2000000010001c01,
+ 0x2010000000611c01,
+ 0x2000000010001c01,
+ 0x2282828282820297,
+ 0x2010000000611c01,
+ 0x2000000010001c01,
+ 0x0800000ffc209e02,
+ 0x480000000c211c03,
+ 0x7000c02c5010dc03,
+ 0x480000001030dc03,
+ 0x0bfffffffc309c02,
+ 0x22b28282b282b287,
+ 0x188ec01ff821dc03,
+ 0x40000000600021e7,
+ 0x6000c00050411c03,
+ 0x4800000004405c03,
+ 0x40000001c0001de7,
+/* 0x10f0: rcp_result_inf_or_denorm */
+ 0x1b0ec01ffc31dc23,
+ 0x40000000a00021e7,
+ 0x22f25232b2825207,
+ 0x3a00000000105c02,
+ 0x1800000000001de2,
+ 0x09ffc00000105c02,
+ 0x40000000e0001de7,
+/* 0x1128: rcp_result_denorm */
+ 0x1a8e0000fc31dc03,
+ 0x3a003ffffc105c02,
+ 0x1000cfa001318004,
+ 0x227202a2e2c282f7,
+ 0x1000cfc00131a004,
+ 0x0800400000105c02,
+ 0x5000000018001c01,
+/* 0x1160: rcp_end */
+ 0x9000000000001de7,
+/* 0x1168: gk104_rsq_f64 */
+ 0x1e0edffc0001dc81,
+ 0x3800200000104042,
+ 0x39fffffffc109c02,
+ 0x22828252c2820277,
+ 0x7000c02c5010dc03,
+ 0x198ec0000833dc03,
+ 0x6800000008009c43,
+ 0x5000d0d400000401,
+ 0xc80000001c115c00,
+ 0x128ec01ffc319c03,
+ 0x6800000018209c03,
+ 0x2282e2827202b287,
+ 0x1a8e0000fc21dc03,
+ 0x40000000800001e7,
+ 0x3a00000000105c02,
+ 0x1800000000001de2,
+ 0x6800000014105c43,
+ 0x9000000000001de7,
+/* 0x11f8: rsq_norm */
+ 0x1800000000011de2,
+ 0x22929292929292f7,
+ 0x1000cfc001321c04,
+ 0x5000000020009c01,
+ 0x5000000010201c01,
+ 0x2010000000419e01,
+ 0x2008000018411c01,
+ 0x5000000010201c01,
+ 0x2010000000419e01,
+ 0x2292929292929297,
+ 0x2008000018411c01,
+ 0x5000000010201c01,
+ 0x2010000000419e01,
+ 0x2008000018411c01,
+ 0x5000000010201c01,
+ 0x2010000000419e01,
+ 0x2008000018411c01,
+ 0x20000002e2820297,
+ 0x5000d06800410401,
+ 0x2800000014005de4,
+ 0x2800000010001de4,
0x9000000000001de7,
0xc800000003f01cc5,
0x2c00000100005c04,
@@ -495,7 +615,7 @@ uint64_t gk104_builtin_code[] = {
0x680100000c1fdc03,
0x4000000a60001c47,
0x180000004000dde2,
-/* 0x0f60: spill_cfstack */
+/* 0x12e0: spill_cfstack */
0x78000009c0000007,
0x0c0000000430dd02,
0x4003ffffa0001ca7,
@@ -543,14 +663,14 @@ uint64_t gk104_builtin_code[] = {
0x4000000100001ea7,
0x480100000c001c03,
0x0800000000105c42,
-/* 0x10d8: shared_loop */
+/* 0x1458: shared_loop */
0xc100000000309c85,
0x9400000500009c85,
0x0c00000010001d02,
0x0800000000105d42,
0x0c0000001030dd02,
0x4003ffff40001ca7,
-/* 0x1108: shared_done */
+/* 0x1488: shared_done */
0x2800406420001de4,
0x2800406430005de4,
0xe000000000001c45,
@@ -564,7 +684,7 @@ uint64_t gk104_builtin_code[] = {
0x480000000c209c03,
0x4801000008001c03,
0x0800000000105c42,
-/* 0x1170: search_cstack */
+/* 0x14f0: search_cstack */
0x280040646000dde4,
0x8400000020009f05,
0x190ec0002821dc03,
@@ -573,17 +693,17 @@ uint64_t gk104_builtin_code[] = {
0x0800000000105c42,
0x0c0000004030dd02,
0x00029dff0ffc5cbf,
-/* 0x11b0: entry_found */
+/* 0x1530: entry_found */
0x8400000000009f85,
0x2800406400001de4,
0x2800406410005de4,
0x9400000010009c85,
0x4000000000001df4,
-/* 0x11d8: end_exit */
+/* 0x1558: end_exit */
0x9800000003ffdcc5,
0xd000000000008007,
0xa000000000004007,
-/* 0x11f0: end_cont */
+/* 0x1570: end_cont */
0xd000000000008007,
0x3400c3fffc201c04,
0xc000000003f01ec5,
@@ -593,6 +713,6 @@ uint64_t gk104_builtin_code[] = {
uint64_t gk104_builtin_offsets[] = {
0x0000000000000000,
0x00000000000000f0,
- 0x0000000000000f08,
0x0000000000000f18,
+ 0x0000000000001168,
};
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
index b9c05a04b9a..4047a565a9f 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -83,12 +83,229 @@ gk110_div_s32:
$p0 sub b32 $r1 $r1 $r2
$p0 add b32 $r0 $r0 0x1
$p3 cvt s32 $r0 neg s32 $r0
- sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
+ sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28
$p2 cvt s32 $r1 neg s32 $r1
ret
+// RCP F64
+//
+// INPUT: $r0d
+// OUTPUT: $r0d
+// CLOBBER: $r2 - $r9, $p0
+//
+// The core of the RCP and RSQ implementations is the Newton-Raphson step,
+// which is used to find successively better approximations from an
+// imprecise initial value (single precision rcp in RCP and rsqrt64h in RSQ).
+//
gk110_rcp_f64:
+ // Step 1: classify input according to exponent and value, and calculate
+ // result for 0/inf/nan. $r2 holds the exponent value, which starts at
+ // bit 52 (bit 20 of the upper half) and is 11 bits in length
+ ext u32 $r2 $r1 0xb14
+ add b32 $r3 $r2 0xffffffff
+ joinat #rcp_rejoin
+ // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
+ // denorm, or 0). Do this by subtracting 1 from the exponent, which will
+ // mean that it's > 0x7fd in those cases when doing unsigned comparison
+ set b32 $p0 0x1 gt u32 $r3 0x7fd
+ // $r3: 0 for norms, 0x36 for denorms, -1 for others
+ mov b32 $r3 0x0
+ sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
+ join (not $p0) nop
+ // Process all special values: NaN, inf, denorm, 0
+ mov b32 $r3 0xffffffff
+ // A number is NaN if its abs value is greater than or unordered with inf
+ set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+ (not $p0) bra #rcp_inf_or_denorm_or_zero
+ // NaN -> NaN, the next line sets the "quiet" bit of the result. This
+ // behavior is seen on both the CPU and the blob
+ join or b32 $r1 $r1 0x80000
+rcp_inf_or_denorm_or_zero:
+ and b32 $r4 $r1 0x7ff00000
+ // Other values with a nonzero exponent field should be inf
+ set b32 $p0 0x1 eq s32 $r4 0x0
+ sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
+ $p0 bra #rcp_denorm_or_zero
+ // +/-Inf -> +/-0
+ xor b32 $r1 $r1 0x7ff00000
+ join mov b32 $r0 0x0
+rcp_denorm_or_zero:
+ set $p0 0x1 gtu f64 abs $r0d 0x0
+ $p0 bra #rcp_denorm
+ // +/-0 -> +/-Inf
+ join or b32 $r1 $r1 0x7ff00000
+rcp_denorm:
+ // non-0 denorms: multiply by 2^54 (the 0x36 in $r3), join with norms
+ mul rn f64 $r0d $r0d 0x4350000000000000
+ sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
+ join mov b32 $r3 0x36
+rcp_rejoin:
+ // All numbers with -1 in $r3 have their result ready in $r0d; return
+ // them, others need further calculation
+ set b32 $p0 0x1 lt s32 $r3 0x0
+ $p0 bra #rcp_end
+ // Step 2: Before the real calculation goes on, renormalize the values to
+ // range [1, 2) by setting the exponent field to 0x3ff (the exponent
+ // of 1); result in $r6d. The exponent will be recovered later.
+ ext u32 $r2 $r1 0xb14
+ and b32 $r7 $r1 0x800fffff
+ add b32 $r7 $r7 0x3ff00000
+ mov b32 $r6 $r0
+ sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
+ // Step 3: Convert new value to float (no overflow will occur due to step
+ // 2), calculate rcp and do one newton-raphson step
+ cvt rz f32 $r5 f64 $r6d
+ rcp f32 $r4 $r5
+ mov b32 $r0 0xbf800000
+ fma rn f32 $r5 $r4 $r5 $r0
+ fma rn f32 $r0 neg $r4 $r5 $r4
+ // Step 4: convert result $r0 back to double, do newton-raphson steps
+ cvt f64 $r0d f32 $r0
+ cvt f64 $r6d f64 neg $r6d
+ sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
+ cvt f64 $r8d f32 0x3f800000
+ // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
+ // The formula used here (and above) is:
+ // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
+ // The following code uses 2 FMAs for each step, and it basically
+ // looks like:
+ // tmp = -src * RCP_{n} + 1
+ // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
+ fma rn f64 $r4d $r6d $r0d $r8d
+ fma rn f64 $r0d $r0d $r4d $r0d
+ // Step 5: Exponent recovery and final processing
+ // The exponent is recovered by adding back what we took off earlier.
+ // Suppose we want to calculate rcp(x) but we have rcp(cx); then
+ // rcp(x) = c * rcp(cx)
+ // The delta in exponent comes from two sources:
+ // 1) The renormalization in step 2. The delta is:
+ // 0x3ff - $r2
+ // 2) (For denorm inputs) The 2^54 we multiplied by at rcp_denorm, stored
+ // in $r3
+ // These 2 sources are calculated in the first two lines below, and then
+ // added to the exponent extracted from the result above.
+ // Note that after processing, the new exponent may be >= 0x7ff (inf)
+ // or <= 0 (denorm). Those cases are handled below, respectively
+ subr b32 $r2 $r2 0x3ff
+ add b32 $r4 $r2 $r3
+ ext u32 $r3 $r1 0xb14
+ // New exponent in $r3
+ add b32 $r3 $r3 $r4
+ add b32 $r2 $r3 0xffffffff
+ sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
+ // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
+ // (same logic as in step 1)
+ set b32 $p0 0x1 lt u32 $r2 0x7fe
+ (not $p0) bra #rcp_result_inf_or_denorm
+ // Norms: convert exponents back and return
+ shl b32 $r4 $r4 clamp 0x14
+ add b32 $r1 $r4 $r1
+ bra #rcp_end
+rcp_result_inf_or_denorm:
+ // New exponent >= 0x7ff means that result is inf
+ set b32 $p0 0x1 ge s32 $r3 0x7ff
+ (not $p0) bra #rcp_result_denorm
+ sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
+ // Infinity
+ and b32 $r1 $r1 0x80000000
+ mov b32 $r0 0x0
+ add b32 $r1 $r1 0x7ff00000
+ bra #rcp_end
+rcp_result_denorm:
+ // A denorm result comes from a huge input. The greatest possible fp64,
+ // i.e. 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
+ // normal value. Other rcp results should be greater than that. If we
+ // set the exponent field to 1, we can recover the result by multiplying
+ // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
+ // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
+ // the logic here.
+ set b32 $p0 0x1 ne u32 $r3 0x0
+ and b32 $r1 $r1 0x800fffff
+ // 0x3e800000: 1/4
+ $p0 cvt f64 $r6d f32 0x3e800000
+ sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
+ // 0x3f000000: 1/2
+ (not $p0) cvt f64 $r6d f32 0x3f000000
+ add b32 $r1 $r1 0x00100000
+ mul rn f64 $r0d $r0d $r6d
+rcp_end:
+ ret
+
+// RSQ F64
+//
+// INPUT: $r0d
+// OUTPUT: $r0d
+// CLOBBER: $r2 - $r9, $p0 - $p1
+//
gk110_rsq_f64:
+ // Before getting the initial result from rsqrt64h, two special cases
+ // must be handled first.
+ // 1. NaN: set the highest bit in the mantissa so it is reliably
+ // recognized as NaN by rsqrt64h
+ set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+ $p0 or b32 $r1 $r1 0x00080000
+ and b32 $r2 $r1 0x7fffffff
+ sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
+ // 2. denorms and small normal values: using their original value would
+ // lose precision either at rsqrt64h or in the first newton-raphson
+ // step below. Take 2 as a threshold in the exponent field, and multiply
+ // by 2^54 if the exponent is smaller or equal (the result is multiplied
+ // by 2^27 at the end to recover)
+ ext u32 $r3 $r1 0xb14
+ set b32 $p1 0x1 le u32 $r3 0x2
+ or b32 $r2 $r0 $r2
+ $p1 mul rn f64 $r0d $r0d 0x4350000000000000
+ rsqrt64h f32 $r5 $r1
+ // rsqrt64h will give the correct result for 0/inf/nan; the following logic
+ // checks whether the input is one of those (exponent is 0x7ff or all 0
+ // except for the sign bit)
+ set b32 $r6 ne u32 $r3 0x7ff
+ and b32 $r2 $r2 $r6
+ sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
+ set b32 $p0 0x1 ne u32 $r2 0x0
+ $p0 bra #rsq_norm
+ // For 0/inf/nan, make sure the sign bit agrees with input and return
+ and b32 $r1 $r1 0x80000000
+ mov b32 $r0 0x0
+ or b32 $r1 $r1 $r5
+ ret
+rsq_norm:
+ // For others, do 4 Newton-Raphson steps with the formula:
+ // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+ // In the code below, each step is written as:
+ // tmp1 = 0.5 * x * RSQ_{n}
+ // tmp2 = -RSQ_{n} * tmp1 + 0.5
+ // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+ mov b32 $r4 0x0
+ sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
+ // 0x3f000000: 1/2
+ cvt f64 $r8d f32 0x3f000000
+ mul rn f64 $r2d $r0d $r8d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ fma rn f64 $r4d $r4d $r6d $r4d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
+ fma rn f64 $r4d $r4d $r6d $r4d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ fma rn f64 $r4d $r4d $r6d $r4d
+ mul rn f64 $r0d $r2d $r4d
+ fma rn f64 $r6d neg $r4d $r0d $r8d
+ fma rn f64 $r4d $r4d $r6d $r4d
+ sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
+ // Multiply the result by 2^27 for small inputs to recover
+ $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
+ mov b32 $r1 $r5
+ mov b32 $r0 $r4
ret
.section #gk110_builtin_offsets
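The rsq loop is the same idea with the iteration r_{n+1} = r_n * (1.5 - 0.5*x*r_n*r_n), split exactly as in the assembly into tmp1 = 0.5*x*r, tmp2 = -r*tmp1 + 0.5, r = r*tmp2 + r. A scalar C model (the single-precision libm seed stands in for the hardware's rsqrt64h estimate; special cases and the 2^54/2^27 rescale for tiny inputs are omitted):

#include <math.h>
#include <stdio.h>

static double
rsq_f64_model(double x)
{
   double r = (double)(1.0f / sqrtf((float)x)); /* imprecise seed */
   double half_x = 0.5 * x;                     /* $r2d in the asm */

   for (int i = 0; i < 4; i++) {
      double t = half_x * r;        /* tmp1 = 0.5 * x * RSQ_n      */
      double u = fma(-r, t, 0.5);   /* tmp2 = -RSQ_n * tmp1 + 0.5  */
      r = fma(r, u, r);             /* RSQ_{n+1} = RSQ_n*u + RSQ_n */
   }
   return r;
}

int main(void)
{
   printf("%.17g\n", rsq_f64_model(2.0)); /* ~0.70710678118654757 */
   return 0;
}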
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
index 8d00e2a2245..3d1523f2fdd 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
@@ -65,11 +65,132 @@ uint64_t gk110_builtin_code[] = {
0xe088000001000406,
0x4000000000800001,
0xe6010000000ce802,
- 0x08b08010a010b810,
+ 0x08a0a0a010a0b810,
0xe60100000088e806,
0x19000000001c003c,
/* 0x0218: gk110_rcp_f64 */
-/* 0x0218: gk110_rsq_f64 */
+ 0xc00000058a1c0409,
+ 0x407fffffff9c080d,
+ 0x1480000050000000,
+ 0xb3401c03fe9c0c1d,
+ 0xe4c03c007f9c000e,
+ 0x08a0a0bcacb410bc,
+ 0x8580000000603c02,
+ 0x747fffffff9fc00e,
+ 0xb4601fff801c021d,
+ 0x120000000420003c,
+ 0x21000400005c0404,
+/* 0x0270: rcp_inf_or_denorm_or_zero */
+ 0x203ff800001c0410,
+ 0xb3281c00001c101d,
+ 0x0880bcacb4bc10ac,
+ 0x120000000800003c,
+ 0x223ff800001c0404,
+ 0xe4c03c007fdc0002,
+/* 0x02a0: rcp_denorm_or_zero */
+ 0xb4601c00001c021d,
+ 0x120000000400003c,
+ 0x213ff800005c0404,
+/* 0x02b8: rcp_denorm */
+ 0xc400021a801c0001,
+ 0x08a010a0a0aca0bc,
+ 0x740000001b5fc00e,
+/* 0x02d0: rcp_rejoin */
+ 0xb3181c00001c0c1d,
+ 0x12000000c000003c,
+ 0xc00000058a1c0409,
+ 0x204007ffff9c041c,
+ 0x401ff800001c1c1d,
+ 0xe4c03c00001c001a,
+ 0x08b8aca8a0a010ac,
+ 0xe5400c00031c3816,
+ 0x84000000021c1412,
+ 0x745fc000001fc002,
+ 0xcc000000029c1016,
+ 0xcc081000029c1002,
+ 0xe5400000001c2c02,
+ 0xe5410000031c3c1a,
+ 0x08a4a4a4a4a4a4b8,
+ 0xc54001fc001c2c21,
+ 0xdb802000001c1812,
+ 0xdb800000021c0002,
+ 0xdb802000001c1812,
+ 0xdb800000021c0002,
+ 0xdb802000001c1812,
+ 0xdb800000021c0002,
+ 0x08a0a0a0a0a080a4,
+ 0xdb802000001c1812,
+ 0xdb800000021c0002,
+ 0x48000001ff9c0809,
+ 0xe0800000019c0812,
+ 0xc00000058a1c040d,
+ 0xe0800000021c0c0e,
+ 0x407fffffff9c0c09,
+ 0x08aca0a0aca0aca0,
+ 0xb3101c03ff1c081d,
+ 0x120000000c20003c,
+ 0xc24000000a1c1011,
+ 0xe0800000009c1006,
+ 0x12000000381c003c,
+/* 0x03f0: rcp_result_inf_or_denorm */
+ 0xb3681c03ff9c0c1d,
+ 0x120000001420003c,
+ 0x08bc948caca09480,
+ 0x20400000001c0404,
+ 0xe4c03c007f9c0002,
+ 0x403ff800001c0405,
+ 0x120000001c1c003c,
+/* 0x0428: rcp_result_denorm */
+ 0xb3501c00001c0c1d,
+ 0x204007ffff9c0404,
+ 0xc54001f400002c19,
+ 0x089c80a8b8b0a0bc,
+ 0xc54001f800202c19,
+ 0x40000800001c0405,
+ 0xe4000000031c0002,
+/* 0x0460: rcp_end */
+ 0x19000000001c003c,
+/* 0x0468: gk110_rsq_f64 */
+ 0xb4601fff801c021d,
+ 0x2100040000000404,
+ 0x203fffffff9c0408,
+ 0x08a0a094b0a0809c,
+ 0xc00000058a1c040d,
+ 0xb3301c00011c0c3d,
+ 0xe2001000011c000a,
+ 0xc400021a80040001,
+ 0x84000000039c0416,
+ 0xb2d01c03ff9c0c19,
+ 0xe2000000031c080a,
+ 0x08a0b8a09c80aca0,
+ 0xb3501c00001c081d,
+ 0x120000001000003c,
+ 0x20400000001c0404,
+ 0xe4c03c007f9c0002,
+ 0xe2001000029c0406,
+ 0x19000000001c003c,
+/* 0x04f8: rsq_norm */
+ 0xe4c03c007f9c0012,
+ 0x08a4a4a4a4a4a4bc,
+ 0xc54001f8001c2c21,
+ 0xe4000000041c000a,
+ 0xe4000000021c0802,
+ 0xdb882000001c101a,
+ 0xdb801000031c1012,
+ 0xe4000000021c0802,
+ 0xdb882000001c101a,
+ 0x08a4a4a4a4a4a4a4,
+ 0xdb801000031c1012,
+ 0xe4000000021c0802,
+ 0xdb882000001c101a,
+ 0xdb801000031c1012,
+ 0xe4000000021c0802,
+ 0xdb882000001c101a,
+ 0xdb801000031c1012,
+ 0x08000000b8a080a4,
+ 0xc400020d00041011,
+ 0xe4c03c00029c0006,
+ 0xe4c03c00021c0002,
0x19000000001c003c,
};
@@ -77,5 +198,5 @@ uint64_t gk110_builtin_offsets[] = {
0x0000000000000000,
0x00000000000000f0,
0x0000000000000218,
- 0x0000000000000218,
+ 0x0000000000000468,
};
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
index 7ee5f8fc65b..faee0218d18 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
@@ -100,10 +100,253 @@ gm107_div_s32:
ret
nop 0
-// STUB
+// RCP F64
+//
+// INPUT: $r0d
+// OUTPUT: $r0d
+// CLOBBER: $r2 - $r9, $p0
+//
+// The core of the RCP and RSQ implementations is the Newton-Raphson step,
+// which is used to find successively better approximations from an
+// imprecise initial value (single precision rcp in RCP and rsqrt64h in RSQ).
+//
gm107_rcp_f64:
-gm107_rsq_f64:
+ // Step 1: classify input according to exponent and value, and calculate
+ // result for 0/inf/nan. $r2 holds the exponent value, which starts at
+ // bit 52 (bit 20 of the upper half) and is 11 bits in length
+ sched (st 0x0) (st 0x0) (st 0x0)
+ bfe u32 $r2 $r1 0xb14
+ iadd32i $r3 $r2 -1
+ ssy #rcp_rejoin
+ // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
+ // denorm, or 0). Do this by subtracting 1 from the exponent, which will
+ // mean that it's > 0x7fd in those cases when doing unsigned comparison
+ sched (st 0x0) (st 0x0) (st 0x0)
+ isetp gt u32 and $p0 1 $r3 0x7fd 1
+ // $r3: 0 for norms, 0x36 for denorms, -1 for others
+ mov $r3 0x0 0xf
+ not $p0 sync
+ // Process all special values: NaN, inf, denorm, 0
+ sched (st 0x0) (st 0x0) (st 0x0)
+ mov32i $r3 0xffffffff 0xf
+ // A number is NaN if its abs value is greater than or unordered with inf
+ dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
+ not $p0 bra #rcp_inf_or_denorm_or_zero
+ // NaN -> NaN, the next line sets the "quiet" bit of the result. This
+ // behavior is seen on both the CPU and the blob
+ sched (st 0x0) (st 0x0) (st 0x0)
+ lop32i or $r1 $r1 0x80000
+ sync
+rcp_inf_or_denorm_or_zero:
+ lop32i and $r4 $r1 0x7ff00000
+ sched (st 0x0) (st 0x0) (st 0x0)
+ // Other values with a nonzero exponent field should be inf
+ isetp eq and $p0 1 $r4 0x0 1
+ $p0 bra #rcp_denorm_or_zero
+ // +/-Inf -> +/-0
+ lop32i xor $r1 $r1 0x7ff00000
+ sched (st 0x0) (st 0x0) (st 0x0)
+ mov $r0 0x0 0xf
+ sync
+rcp_denorm_or_zero:
+ dsetp gtu and $p0 1 abs $r0 0x0 1
+ sched (st 0x0) (st 0x0) (st 0x0)
+ $p0 bra #rcp_denorm
+ // +/-0 -> +/-Inf
+ lop32i or $r1 $r1 0x7ff00000
+ sync
+rcp_denorm:
+ // non-0 denorms: multiply by 2^54 (the 0x36 in $r3), join with norms
+ sched (st 0x0) (st 0x0) (st 0x0)
+ dmul $r0 $r0 0x4350000000000000
+ mov $r3 0x36 0xf
+ sync
+rcp_rejoin:
+ // All numbers with -1 in $r3 have their result ready in $r0d; return
+ // them, others need further calculation
+ sched (st 0x0) (st 0x0) (st 0x0)
+ isetp lt and $p0 1 $r3 0x0 1
+ $p0 bra #rcp_end
+ // Step 2: Before the real calculation goes on, renormalize the values to
+ // range [1, 2) by setting the exponent field to 0x3ff (the exponent
+ // of 1); result in $r6d. The exponent will be recovered later.
+ bfe u32 $r2 $r1 0xb14
+ sched (st 0x0) (st 0x0) (st 0x0)
+ lop32i and $r7 $r1 0x800fffff
+ iadd32i $r7 $r7 0x3ff00000
+ mov $r6 $r0 0xf
+ // Step 3: Convert new value to float (no overflow will occur due to step
+ // 2), calculate rcp and do one newton-raphson step
+ sched (st 0x0) (st 0x0) (st 0x0)
+ f2f ftz f64 f32 $r5 $r6
+ mufu rcp $r4 $r5
+ mov32i $r0 0xbf800000 0xf
+ sched (st 0x0) (st 0x0) (st 0x0)
+ ffma $r5 $r4 $r5 $r0
+ ffma $r0 $r5 neg $r4 $r4
+ // Step 4: convert result $r0 back to double, do newton-raphson steps
+ f2f f32 f64 $r0 $r0
+ sched (st 0x0) (st 0x0) (st 0x0)
+ f2f f64 f64 $r6 neg $r6
+ f2f f32 f64 $r8 0x3f800000
+ // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
+ // The formula used here (and above) is:
+ // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
+ // The following code uses 2 FMAs for each step, and it basically
+ // looks like:
+ // tmp = -src * RCP_{n} + 1
+ // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
+ dfma $r4 $r6 $r0 $r8
sched (st 0x0) (st 0x0) (st 0x0)
+ dfma $r0 $r0 $r4 $r0
+ dfma $r4 $r6 $r0 $r8
+ dfma $r0 $r0 $r4 $r0
+ sched (st 0x0) (st 0x0) (st 0x0)
+ dfma $r4 $r6 $r0 $r8
+ dfma $r0 $r0 $r4 $r0
+ dfma $r4 $r6 $r0 $r8
+ sched (st 0x0) (st 0x0) (st 0x0)
+ dfma $r0 $r0 $r4 $r0
+ // Step 5: Exponent recovery and final processing
+ // The exponent is recovered by adding back what we took off earlier.
+ // Suppose we want to calculate rcp(x) but we have rcp(cx); then
+ // rcp(x) = c * rcp(cx)
+ // The delta in exponent comes from two sources:
+ // 1) The renormalization in step 2. The delta is:
+ // 0x3ff - $r2
+ // 2) (For denorm inputs) The 2^54 we multiplied by at rcp_denorm, stored
+ // in $r3
+ // These 2 sources are calculated in the first two lines below, and then
+ // added to the exponent extracted from the result above.
+ // Note that after processing, the new exponent may be >= 0x7ff (inf)
+ // or <= 0 (denorm). Those cases are handled below, respectively
+ iadd $r2 neg $r2 0x3ff
+ iadd $r4 $r2 $r3
+ sched (st 0x0) (st 0x0) (st 0x0)
+ bfe u32 $r3 $r1 0xb14
+ // New exponent in $r3
+ iadd $r3 $r3 $r4
+ iadd32i $r2 $r3 -1
+ // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
+ // (same logic as in step 1)
+ sched (st 0x0) (st 0x0) (st 0x0)
+ isetp lt u32 and $p0 1 $r2 0x7fe 1
+ not $p0 bra #rcp_result_inf_or_denorm
+ // Norms: convert exponents back and return
+ shl $r4 $r4 0x14
+ sched (st 0x0) (st 0x0) (st 0x0)
+ iadd $r1 $r4 $r1
+ bra #rcp_end
+rcp_result_inf_or_denorm:
+ // New exponent >= 0x7ff means that result is inf
+ isetp ge and $p0 1 $r3 0x7ff 1
+ sched (st 0x0) (st 0x0) (st 0x0)
+ not $p0 bra #rcp_result_denorm
+ // Infinity
+ lop32i and $r1 $r1 0x80000000
+ mov $r0 0x0 0xf
+ sched (st 0x0) (st 0x0) (st 0x0)
+ iadd32i $r1 $r1 0x7ff00000
+ bra #rcp_end
+rcp_result_denorm:
+ // A denorm result comes from a huge input. The greatest possible fp64,
+ // i.e. 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
+ // normal value. Other rcp results should be greater than that. If we
+ // set the exponent field to 1, we can recover the result by multiplying
+ // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
+ // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
+ // the logic here.
+ isetp ne u32 and $p0 1 $r3 0x0 1
+ sched (st 0x0) (st 0x0) (st 0x0)
+ lop32i and $r1 $r1 0x800fffff
+ // 0x3e800000: 1/4
+ $p0 f2f f32 f64 $r6 0x3e800000
+ // 0x3f000000: 1/2
+ not $p0 f2f f32 f64 $r6 0x3f000000
+ sched (st 0x0) (st 0x0) (st 0x0)
+ iadd32i $r1 $r1 0x00100000
+ dmul $r0 $r0 $r6
+rcp_end:
+ ret
+
+// RSQ F64
+//
+// INPUT: $r0d
+// OUTPUT: $r0d
+// CLOBBER: $r2 - $r9, $p0 - $p1
+//
+gm107_rsq_f64:
+ // Before getting the initial result from rsqrt64h, two special cases
+ // must be handled first.
+ // 1. NaN: set the highest bit in the mantissa so it is reliably
+ // recognized as NaN by rsqrt64h
+ sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd)
+ dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1
+ $p0 lop32i or $r1 $r1 0x00080000
+ lop32i and $r2 $r1 0x7fffffff
+ // 2. denorms and small normal values: using their original value would
+ // lose precision either at rsqrt64h or in the first newton-raphson
+ // step below. Take 2 as a threshold in the exponent field, and multiply
+ // by 2^54 if the exponent is smaller or equal (the result is multiplied
+ // by 2^27 at the end to recover)
+ sched (st 0xd) (st 0xd) (st 0xd)
+ bfe u32 $r3 $r1 0xb14
+ isetp le u32 and $p1 1 $r3 0x2 1
+ lop or 1 $r2 $r0 $r2
+ sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd)
+ $p1 dmul $r0 $r0 0x4350000000000000
+ mufu rsq64h $r5 $r1
+ // rsqrt64h will give the correct result for 0/inf/nan; the following logic
+ // checks whether the input is one of those (exponent is 0x7ff or all 0
+ // except for the sign bit)
+ iset ne u32 and $r6 $r3 0x7ff 1
+ sched (st 0xd) (st 0xd) (st 0xd)
+ lop and 1 $r2 $r2 $r6
+ isetp ne u32 and $p0 1 $r2 0x0 1
+ $p0 bra #rsq_norm
+ // For 0/inf/nan, make sure the sign bit agrees with input and return
+ sched (st 0xd) (st 0xd) (st 0xd wt 0x1)
+ lop32i and $r1 $r1 0x80000000
+ mov $r0 0x0 0xf
+ lop or 1 $r1 $r1 $r5
+ sched (st 0xd) (st 0xf) (st 0xf)
+ ret
+ nop 0
+ nop 0
+rsq_norm:
+ // For others, do 4 Newton-Raphson steps with the formula:
+ // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+ // In the code below, each step is written as:
+ // tmp1 = 0.5 * x * RSQ_{n}
+ // tmp2 = -RSQ_{n} * tmp1 + 0.5
+ // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
+ sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3)
+ mov $r4 0x0 0xf
+ // 0x3f000000: 1/2
+ f2f f32 f64 $r8 0x3f000000
+ dmul $r2 $r0 $r8
+ sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+ dmul $r0 $r2 $r4
+ dfma $r6 $r0 neg $r4 $r8
+ dfma $r4 $r4 $r6 $r4
+ sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+ dmul $r0 $r2 $r4
+ dfma $r6 $r0 neg $r4 $r8
+ dfma $r4 $r4 $r6 $r4
+ sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+ dmul $r0 $r2 $r4
+ dfma $r6 $r0 neg $r4 $r8
+ dfma $r4 $r4 $r6 $r4
+ sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1)
+ dmul $r0 $r2 $r4
+ dfma $r6 $r0 neg $r4 $r8
+ dfma $r4 $r4 $r6 $r4
+ // Multiply the result by 2^27 for small inputs to recover
+ sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd)
+ $p1 dmul $r4 $r4 0x41a0000000000000
+ mov $r1 $r5 0xf
+ mov $r0 $r4 0xf
+ sched (st 0xd) (st 0xf) (st 0xf)
ret
nop 0
nop 0
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
index 65c93f7ae89..8eb27bbac99 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
@@ -82,8 +82,156 @@ uint64_t gm107_builtin_code[] = {
0xe32000000007000f,
0x50b0000000070f00,
/* 0x0280: gm107_rcp_f64 */
-/* 0x0280: gm107_rsq_f64 */
0x001f8000fc0007e0,
+ 0x38000000b1470102,
+ 0x1c0ffffffff70203,
+ 0xe29000000e000000,
+ 0x001f8000fc0007e0,
+ 0x366803807fd70307,
+ 0x5c9807800ff70003,
+ 0xf0f800000008000f,
+ 0x001f8000fc0007e0,
+ 0x010ffffffff7f003,
+ 0x368c03fff0070087,
+ 0xe24000000188000f,
+ 0x001f8000fc0007e0,
+ 0x0420008000070101,
+ 0xf0f800000007000f,
+/* 0x02f8: rcp_inf_or_denorm_or_zero */
+ 0x0407ff0000070104,
+ 0x001f8000fc0007e0,
+ 0x5b6503800ff70407,
+ 0xe24000000200000f,
+ 0x0447ff0000070101,
+ 0x001f8000fc0007e0,
+ 0x5c9807800ff70000,
+ 0xf0f800000007000f,
+/* 0x0338: rcp_denorm_or_zero */
+ 0x5b8c03800ff70087,
+ 0x001f8000fc0007e0,
+ 0xe24000000100000f,
+ 0x0427ff0000070101,
+ 0xf0f800000007000f,
+/* 0x0360: rcp_denorm */
+ 0x001f8000fc0007e0,
+ 0x3880004350070000,
+ 0x3898078003670003,
+ 0xf0f800000007000f,
+/* 0x0380: rcp_rejoin */
+ 0x001f8000fc0007e0,
+ 0x5b6303800ff70307,
+ 0xe24000001c00000f,
+ 0x38000000b1470102,
+ 0x001f8000fc0007e0,
+ 0x040800fffff70107,
+ 0x1c03ff0000070707,
+ 0x5c98078000070006,
+ 0x001f8000fc0007e0,
+ 0x5ca8100000670e05,
+ 0x5080000000470504,
+ 0x010bf8000007f000,
+ 0x001f8000fc0007e0,
+ 0x5980000000570405,
+ 0x5981020000470500,
+ 0x5ca8000000070b00,
+ 0x001f8000fc0007e0,
+ 0x5ca8200000670f06,
+ 0x38a8003f80070b08,
+ 0x5b70040000070604,
+ 0x001f8000fc0007e0,
+ 0x5b70000000470000,
+ 0x5b70040000070604,
+ 0x5b70000000470000,
+ 0x001f8000fc0007e0,
+ 0x5b70040000070604,
+ 0x5b70000000470000,
+ 0x5b70040000070604,
+ 0x001f8000fc0007e0,
+ 0x5b70000000470000,
+ 0x381200003ff70202,
+ 0x5c10000000370204,
+ 0x001f8000fc0007e0,
+ 0x38000000b1470103,
+ 0x5c10000000470303,
+ 0x1c0ffffffff70302,
+ 0x001f8000fc0007e0,
+ 0x366203807fe70207,
+ 0xe24000000208000f,
+ 0x3848000001470404,
+ 0x001f8000fc0007e0,
+ 0x5c10000000170401,
+ 0xe24000000807000f,
+/* 0x04d8: rcp_result_inf_or_denorm */
+ 0x366d03807ff70307,
+ 0x001f8000fc0007e0,
+ 0xe24000000288000f,
+ 0x0408000000070101,
+ 0x5c9807800ff70000,
+ 0x001f8000fc0007e0,
+ 0x1c07ff0000070101,
+ 0xe24000000407000f,
+/* 0x0518: rcp_result_denorm */
+ 0x5b6a03800ff70307,
+ 0x001f8000fc0007e0,
+ 0x040800fffff70101,
+ 0x38a8003e80000b06,
+ 0x38a8003f00080b06,
+ 0x001f8000fc0007e0,
+ 0x1c00010000070101,
+ 0x5c80000000670000,
+/* 0x0558: rcp_end */
+ 0xe32000000007000f,
+/* 0x0560: gm107_rsq_f64 */
+ 0x001fb401fda1ff0d,
+ 0x368c03fff0070087,
+ 0x0420008000000101,
+ 0x0407fffffff70102,
+ 0x001fb400fda007ed,
+ 0x38000000b1470103,
+ 0x366603800027030f,
+ 0x5c47020000270002,
+ 0x001fb401e1a0070d,
+ 0x3880004350010000,
+ 0x5080000000770105,
+ 0x365a03807ff70306,
+ 0x001fb400fda007ed,
+ 0x5c47000000670202,
+ 0x5b6a03800ff70207,
+ 0xe24000000400000f,
+ 0x003fb400fda007ed,
+ 0x0408000000070101,
+ 0x5c9807800ff70000,
+ 0x5c47020000570101,
+ 0x001fbc00fde007ed,
+ 0xe32000000007000f,
+ 0x50b0000000070f00,
+ 0x50b0000000070f00,
+/* 0x0620: rsq_norm */
+ 0x0060b400e5a007ed,
+ 0x5c9807800ff70004,
+ 0x38a8003f00070b08,
+ 0x5c80000000870002,
+ 0x003c3401e1a01f0d,
+ 0x5c80000000470200,
+ 0x5b71040000470006,
+ 0x5b70020000670404,
+ 0x003c3401e1a00f0d,
+ 0x5c80000000470200,
+ 0x5b71040000470006,
+ 0x5b70020000670404,
+ 0x003c3401e1a00f0d,
+ 0x5c80000000470200,
+ 0x5b71040000470006,
+ 0x5b70020000670404,
+ 0x003c3401e1a00f0d,
+ 0x5c80000000470200,
+ 0x5b71040000470006,
+ 0x5b70020000670404,
+ 0x001fb401fda00f0d,
+ 0x38800041a0010404,
+ 0x5c98078000570001,
+ 0x5c98078000470000,
+ 0x001fbc00fde007ed,
0xe32000000007000f,
0x50b0000000070f00,
0x50b0000000070f00,
@@ -93,5 +241,5 @@ uint64_t gm107_builtin_offsets[] = {
0x0000000000000000,
0x0000000000000120,
0x0000000000000280,
- 0x0000000000000280,
+ 0x0000000000000560,
};
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index 49425b98b91..993d01c1e44 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch)
binSize = 0;
maxGPR = -1;
+ fp64 = false;
main = new Function(this, "MAIN", ~0);
calls.insert(&main->call);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 8085bb2f542..8d32a25ec23 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -1311,6 +1311,7 @@ class Program
uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL
int maxGPR;
+ bool fp64;
MemoryPool mem_Instruction;
MemoryPool mem_CmpInstruction;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index afd7916a321..335e708c5cb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1087,6 +1087,8 @@ class Source
};
std::vector<MemoryFile> memoryFiles;
+ std::vector<bool> bufferAtomics;
+
private:
int inferSysValDirection(unsigned sn) const;
bool scanDeclaration(const struct tgsi_full_declaration *);
@@ -1137,6 +1139,7 @@ bool Source::scanSource()
//resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1);
memoryFiles.resize(scan.file_max[TGSI_FILE_MEMORY] + 1);
+ bufferAtomics.resize(scan.file_max[TGSI_FILE_BUFFER] + 1);
info->immd.bufSize = 0;
@@ -1483,11 +1486,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair(
first, last - first + 1)));
break;
+ case TGSI_FILE_BUFFER:
+ for (i = first; i <= last; ++i)
+ bufferAtomics[i] = decl->Declaration.Atomic;
+ break;
case TGSI_FILE_ADDRESS:
case TGSI_FILE_CONSTANT:
case TGSI_FILE_IMMEDIATE:
case TGSI_FILE_SAMPLER:
- case TGSI_FILE_BUFFER:
case TGSI_FILE_IMAGE:
break;
default:
@@ -2720,7 +2726,11 @@ Converter::handleLOAD(Value *dst0[4])
}
Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
- ld->cache = tgsi.getCacheMode();
+ if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER &&
+ code->bufferAtomics[r])
+ ld->cache = nv50_ir::CACHE_CG;
+ else
+ ld->cache = tgsi.getCacheMode();
if (ind)
ld->setIndirect(0, 1, ind);
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 295497be2f9..346a98228bd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -83,6 +83,38 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
delete_Instruction(prog, i);
}
+void
+NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
+{
+ FlowInstruction *call;
+ Value *def[2];
+ int builtin;
+
+ def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
+ def[1] = bld.mkMovToReg(1, src[1])->getDef(0);
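+ /* Calling convention, as implied by the builders here: the two 32-bit
+  * halves of the f64 operand go in $r0/$r1, the builtin returns its
+  * result in $r0/$r1, and the clobber masks below declare $r2..$r9
+  * (mask 0x3fc) and the predicate(s) trashed by the library code. */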
+
+ if (i->op == OP_RCP)
+ builtin = NVC0_BUILTIN_RCP_F64;
+ else
+ builtin = NVC0_BUILTIN_RSQ_F64;
+
+ call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
+ def[0] = bld.getSSA();
+ def[1] = bld.getSSA();
+ bld.mkMovFromReg(def[0], 0);
+ bld.mkMovFromReg(def[1], 1);
+ bld.mkClobber(FILE_GPR, 0x3fc, 2);
+ bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
+ bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);
+
+ call->fixed = 1;
+ call->absolute = call->builtin = 1;
+ call->target.builtin = builtin;
+ delete_Instruction(prog, i);
+
+ prog->fp64 = true;
+}
+
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
@@ -96,6 +128,12 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
Value *src[2], *dst[2], *def = i->getDef(0);
bld.mkSplit(src, 4, i->getSrc(0));
+ int chip = prog->getTarget()->getChipset();
+ if (chip >= NVISA_GK104_CHIPSET) {
+ handleRCPRSQLib(i, src);
+ return;
+ }
+
// 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
dst[0] = bld.loadImm(NULL, 0);
dst[1] = bld.getSSA();
@@ -1063,22 +1101,6 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
}
}
- if (chipset >= NVISA_GK104_CHIPSET) {
- //
- // If TEX requires more than 4 sources, the 2nd register tuple must be
- // aligned to 4, even if it consists of just a single 4-byte register.
- //
- // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
- //
- int s = i->srcCount(0xff, true);
- if (s > 4 && s < 7) {
- if (i->srcExists(s)) // move potential predicate out of the way
- i->moveSources(s, 7 - s);
- while (s < 7)
- i->setSrc(s++, bld.loadImm(NULL, 0));
- }
- }
-
return true;
}
@@ -1887,7 +1909,8 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
const int slot = su->tex.r;
const int dim = su->tex.target.getDim();
- const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+ const bool array = su->tex.target.isArray() || su->tex.target.isCube();
+ const int arg = dim + array;
int c;
Value *zero = bld.mkImm(0);
Value *p1 = NULL;
@@ -1896,6 +1919,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
Value *bf, *eau, *off;
Value *addr, *pred;
Value *ind = su->getIndirectR();
+ Value *y, *z;
off = bld.getScratch(4);
bf = bld.getScratch(4);
@@ -1926,34 +1950,42 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
for (; c < 3; ++c)
src[c] = zero;
+ if (dim == 2 && !array) {
+ v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+ src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
+ v, bld.loadImm(NULL, 16));
+
+ v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
+ bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
+ ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
+ }
+
// set predicate output
if (su->tex.target == TEX_TARGET_BUFFER) {
src[0]->getInsn()->setFlagsDef(1, pred);
} else
- if (su->tex.target.isArray() || su->tex.target.isCube()) {
+ if (array) {
p1 = bld.getSSA(1, FILE_PREDICATE);
src[dim]->getInsn()->setFlagsDef(1, p1);
}
// calculate pixel offset
if (dim == 1) {
+ y = z = zero;
if (su->tex.target != TEX_TARGET_BUFFER)
bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
- } else
- if (dim == 3) {
+ } else {
+ y = src[1];
+ z = src[2];
+
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
- ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+ ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
- ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
- } else {
- assert(dim == 2);
- v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
- bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
- ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
- NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+ ->subOp = array ?
+ NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
}
// calculate effective address part 1
@@ -1966,19 +1998,15 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
}
} else {
- Value *y = src[1];
- Value *z = src[2];
uint16_t subOp = 0;
switch (dim) {
case 1:
- y = zero;
- z = zero;
break;
case 2:
- z = off;
- if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
- z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+ if (array) {
+ z = off;
+ } else {
subOp = NV50_IR_SUBOP_SUBFM_3D;
}
break;
@@ -2001,7 +2029,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
}
// add array layer offset
- if (su->tex.target.isArray() || su->tex.target.isCube()) {
+ if (array) {
v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
if (dim == 1)
bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index e0f50ab0904..99809726602 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -62,6 +62,7 @@ class NVC0LegalizeSSA : public Pass
// we want to insert calls to the builtin library only after optimization
void handleDIV(Instruction *); // integer division, modulus
+ void handleRCPRSQLib(Instruction *, Value *[]);
void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
void handleFTZ(Instruction *);
void handleSET(CmpInstruction *);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index f4379c137c5..f25bce00884 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -2341,9 +2341,19 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
if (!tex->tex.target.isArray() && tex->tex.useOffsets)
s++;
}
- n = tex->srcCount(0xff) - s;
+ n = tex->srcCount(0xff, true) - s;
+ // TODO: Is this necessary? Perhaps the tuple just has to be aligned
+ // to the same level as the first arg, not necessarily to 4. This
+ // requirement has not been rigorously verified here, unlike on
+ // Kepler, where it has been.
+ if (n > 0 && n < 3) {
+ if (tex->srcExists(n + s)) // move potential predicate out of the way
+ tex->moveSources(n + s, 3 - n);
+ while (n < 3)
+ tex->setSrc(s + n++, new_LValue(func, FILE_GPR));
+ }
} else {
- s = tex->srcCount(0xff);
+ s = tex->srcCount(0xff, true);
n = 0;
}
@@ -2366,14 +2376,18 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
} else
if (isTextureOp(tex->op)) {
int n = tex->srcCount(0xff, true);
- if (n > 4) {
- condenseSrcs(tex, 0, 3);
- if (n > 5) // NOTE: first call modified positions already
- condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
- } else
- if (n > 1) {
- condenseSrcs(tex, 0, n - 1);
+ int s = n > 4 ? 4 : n;
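+ // If a TEX needs more than 4 sources, the second register tuple must
+ // be aligned to 4, so pad the 5- and 6-source cases out to 7 before
+ // condensing (this restates the constraint from the comment formerly
+ // in handleTEX).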
+ if (n > 4 && n < 7) {
+ if (tex->srcExists(n)) // move potential predicate out of the way
+ tex->moveSources(n, 7 - n);
+
+ while (n < 7)
+ tex->setSrc(n++, new_LValue(func, FILE_GPR));
}
+ if (s > 1)
+ condenseSrcs(tex, 0, s - 1);
+ if (n > 4)
+ condenseSrcs(tex, 1, n - s);
}
}
@@ -2510,6 +2524,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s)
assert(cst->getSrc(s)->defs.size() == 1); // still SSA
Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
+
bool imm = defi->op == OP_MOV &&
defi->src(0).getFile() == FILE_IMMEDIATE;
bool load = defi->op == OP_LOAD &&
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index 9193a01f189..5c6d0570ae2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -399,6 +399,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
}
}
}
+ info->io.fp64 |= fp64;
info->bin.relocData = emit->getRelocInfo();
info->bin.fixupData = emit->getFixupInfo();
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 2b69a8f6968..53551ebc037 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -79,6 +79,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 2048;
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
return 8 * 1024 * 1024;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 8;
+
/* supported capabilities */
case PIPE_CAP_ANISOTROPIC_FILTER:
case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index e30380cd84d..13088ebb5fa 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -98,12 +98,10 @@ nv50_render_condition(struct pipe_context *pipe,
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
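+ /* If the result is already on hand, treat this as a waiting condition
+ * (exact compare) but skip the actual GPU wait further down. */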
+ if (hq->state == NV50_HW_QUERY_STATE_READY)
+ wait = true;
if (likely(!condition)) {
- if (unlikely(hq->nesting))
- cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
- NV50_3D_COND_MODE_ALWAYS;
- else
- cond = NV50_3D_COND_MODE_RES_NON_ZERO;
+ cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS;
} else {
cond = wait ? NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_ALWAYS;
}
@@ -129,7 +127,7 @@ nv50_render_condition(struct pipe_context *pipe,
PUSH_SPACE(push, 9);
- if (wait) {
+ if (wait && hq->state != NV50_HW_QUERY_STATE_READY) {
BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
PUSH_DATA (push, 0);
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index ac3e409b2d5..4e74c462235 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -29,11 +29,6 @@
#include "nv50/nv50_query_hw_sm.h"
#include "nv_object.xml.h"
-#define NV50_HW_QUERY_STATE_READY 0
-#define NV50_HW_QUERY_STATE_ACTIVE 1
-#define NV50_HW_QUERY_STATE_ENDED 2
-#define NV50_HW_QUERY_STATE_FLUSHED 3
-
/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
* (since we use only a single GPU channel per screen) will not work properly.
*
@@ -158,8 +153,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- hq->nesting = nv50->screen->num_occlusion_queries_active++;
- if (hq->nesting) {
+ if (nv50->screen->num_occlusion_queries_active++) {
nv50_hw_query_get(push, q, 0x10, 0x0100f002);
} else {
PUSH_SPACE(push, 4);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
index 82ec6bd2d96..a89a66cec4f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
@@ -6,6 +6,11 @@
#include "nv50_query.h"
+#define NV50_HW_QUERY_STATE_READY 0
+#define NV50_HW_QUERY_STATE_ACTIVE 1
+#define NV50_HW_QUERY_STATE_ENDED 2
+#define NV50_HW_QUERY_STATE_FLUSHED 3
+
#define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
struct nv50_hw_query;
@@ -29,7 +34,6 @@ struct nv50_hw_query {
uint8_t state;
bool is64bit;
uint8_t rotate;
- int nesting; /* only used for occlusion queries */
struct nouveau_mm_allocation *mm;
struct nouveau_fence *fence;
};
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 0c53b22eb3c..8e65eaf50b1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -156,6 +156,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return NV50_MAX_WINDOW_RECTANGLES;
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
return 16 * 1024 * 1024;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 15;
/* supported caps */
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -215,6 +217,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_CLOCK:
case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+ case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP:
return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -312,6 +315,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_ATOMFADD:
case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+ case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
return 0;
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index eb057bf2489..c1351062676 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -434,6 +434,7 @@ nvc0_video_buffer_create(struct pipe_context *pipe,
/* nvc0_push.c */
void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *);
+void nvc0_push_vbo_indirect(struct nvc0_context *, const struct pipe_draw_info *);
/* nve4_compute.c */
void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 1a3e4e794c0..40af9936859 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -121,12 +121,10 @@ nvc0_render_condition(struct pipe_context *pipe,
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ if (hq->state == NVC0_HW_QUERY_STATE_READY)
+ wait = true;
if (likely(!condition)) {
- if (unlikely(hq->nesting))
- cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
- NVC0_3D_COND_MODE_ALWAYS;
- else
- cond = NVC0_3D_COND_MODE_RES_NON_ZERO;
+ cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
} else {
cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
}
@@ -151,7 +149,7 @@ nvc0_render_condition(struct pipe_context *pipe,
return;
}
- if (wait)
+ if (wait && hq->state != NVC0_HW_QUERY_STATE_READY)
nvc0_hw_query_fifo_wait(nvc0, q);
PUSH_SPACE(push, 10);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index a420ed4ac0d..f6d5d0f5602 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -28,11 +28,6 @@
#include "nvc0/nvc0_query_hw_metric.h"
#include "nvc0/nvc0_query_hw_sm.h"
-#define NVC0_HW_QUERY_STATE_READY 0
-#define NVC0_HW_QUERY_STATE_ACTIVE 1
-#define NVC0_HW_QUERY_STATE_ENDED 2
-#define NVC0_HW_QUERY_STATE_FLUSHED 3
-
#define NVC0_HW_QUERY_ALLOC_SPACE 256
bool
@@ -158,14 +153,18 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- hq->nesting = nvc0->screen->num_occlusion_queries_active++;
- if (hq->nesting) {
+ if (nvc0->screen->num_occlusion_queries_active++) {
nvc0_hw_query_get(push, q, 0x10, 0x0100f002);
} else {
PUSH_SPACE(push, 3);
BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
+ /* Given that the counter is reset, the contents at 0x10 are
+ * equivalent to doing the query -- we would get hq->sequence as the
+ * payload and 0 as the reported value. This is already set up above,
+ * as in the hq->rotate case.
+ */
}
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -199,6 +198,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
+ ((uint64_t *)hq->data)[(12 + 10) * 2] = 0;
break;
default:
break;
@@ -271,6 +271,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
+ ((uint64_t *)hq->data)[10 * 2] = 0;
break;
case PIPE_QUERY_TIMESTAMP_DISJOINT:
/* This query is not issued on GPU because disjoint is forced to false */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
index 8225755d85e..5c8ad5eb2d0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h
@@ -6,6 +6,11 @@
#include "nvc0_query.h"
+#define NVC0_HW_QUERY_STATE_READY 0
+#define NVC0_HW_QUERY_STATE_ACTIVE 1
+#define NVC0_HW_QUERY_STATE_ENDED 2
+#define NVC0_HW_QUERY_STATE_FLUSHED 3
+
#define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
struct nvc0_hw_query;
@@ -29,7 +34,6 @@ struct nvc0_hw_query {
uint8_t state;
boolean is64bit;
uint8_t rotate;
- int nesting; /* only used for occlusion queries */
struct nouveau_mm_allocation *mm;
struct nouveau_fence *fence;
};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 68b5869276a..553fe324bc7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -182,6 +182,13 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return class_3d >= GM200_3D_CLASS ? 8 : 0;
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
return 64 * 1024 * 1024;
+ case PIPE_CAP_MAX_VARYINGS:
+ /* NOTE: These only count our slots for GENERIC varyings.
+ * The address space may be larger, but the actual hard limit seems to be
+ * less than what the address space layout permits, so don't add TEXCOORD,
+ * COLOR, etc. here.
+ */
+ return 0x1f0 / 16;
/* supported caps */
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -266,6 +273,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
case PIPE_CAP_QUERY_SO_OVERFLOW:
+ case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL:
return 1;
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
@@ -336,6 +344,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+ case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
return 0;
case PIPE_CAP_VENDOR_ID:
@@ -392,18 +401,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
return 16;
case PIPE_SHADER_CAP_MAX_INPUTS:
- if (shader == PIPE_SHADER_VERTEX)
- return 32;
- /* NOTE: These only count our slots for GENERIC varyings.
- * The address space may be larger, but the actual hard limit seems to be
- * less than what the address space layout permits, so don't add TEXCOORD,
- * COLOR, etc. here.
- */
- if (shader == PIPE_SHADER_FRAGMENT)
- return 0x1f0 / 16;
- /* Actually this counts CLIPVERTEX, which occupies the last generic slot,
- * and excludes 0x60 per-patch inputs.
- */
return 0x200 / 16;
case PIPE_SHADER_CAP_MAX_OUTPUTS:
return 32;
@@ -1286,8 +1283,8 @@ nvc0_screen_create(struct nouveau_device *dev)
for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) {
BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3);
PUSH_DATA (push, 1);
- PUSH_DATA (push, 8192 << 16);
- PUSH_DATA (push, 8192 << 16);
+ PUSH_DATA (push, 16384 << 16);
+ PUSH_DATA (push, 16384 << 16);
}
#define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 04f0a0d55da..8820b5aac66 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -1051,21 +1051,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
} else {
struct nv50_miptree *mt = nv50_miptree(&res->base);
struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level];
- const unsigned z = view->u.tex.first_layer;
-
- if (z) {
- if (mt->layout_3d) {
- address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z);
- /* doesn't work if z passes z-tile boundary */
- if (depth > 1) {
- pipe_debug_message(&nvc0->base.debug, CONFORMANCE,
- "3D images are not really supported!");
- debug_printf("3D images are not really supported!\n");
- }
- } else {
- address += mt->layer_stride * z;
- }
+ unsigned z = view->u.tex.first_layer;
+
+ if (!mt->layout_3d) {
+ address += mt->layer_stride * z;
+ z = 0;
}
+
address += lvl->offset;
info[0] = address >> 8;
@@ -1080,7 +1072,8 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
info[6] = depth - 1;
info[6] |= (lvl->tile_mode & 0xf00) << 21;
info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22;
- info[7] = 0;
+ info[7] = mt->layout_3d ? 1 : 0;
+ info[7] |= z << 16;
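+ /* info[7] now carries the 3D-layout flag (bit 0) and the first
+ * z-slice (bits 16+); folding the z offset into the base address, as
+ * before, broke once z crossed a z-tile boundary. */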
info[14] = mt->ms_x;
info[15] = mt->ms_y;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 3fbe7614e52..7d6be9382d1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -1040,7 +1040,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
}
if (nvc0->state.vbo_mode) {
- nvc0_push_vbo(nvc0, info);
+ if (info->indirect)
+ nvc0_push_vbo_indirect(nvc0, info);
+ else
+ nvc0_push_vbo(nvc0, info);
goto cleanup;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index 256e20df2e4..4333fb26d23 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -466,6 +466,83 @@ nvc0_prim_gl(unsigned prim)
}
}
+typedef struct {
+ uint32_t count;
+ uint32_t primCount;
+ uint32_t first;
+ uint32_t baseInstance;
+} DrawArraysIndirectCommand;
+
+typedef struct {
+ uint32_t count;
+ uint32_t primCount;
+ uint32_t firstIndex;
+ int32_t baseVertex;
+ uint32_t baseInstance;
+} DrawElementsIndirectCommand;
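+/* These layouts match the ARB_draw_indirect command structures that the
+ * client writes into the indirect buffer. */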
+
+void
+nvc0_push_vbo_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
+{
+ /* The strategy here is to just read the commands from the indirect buffer
+ * and do the draws. This is suboptimal, but it only happens when
+ * conversion is required for FIXED or DOUBLE inputs.
+ */
+ struct nvc0_screen *screen = nvc0->screen;
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nv04_resource *buf = nv04_resource(info->indirect->buffer);
+ struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count);
+ unsigned i;
+
+ unsigned draw_count = info->indirect->draw_count;
+ if (buf_count) {
+ uint32_t *count = nouveau_resource_map_offset(
+ &nvc0->base, buf_count, info->indirect->indirect_draw_count_offset,
+ NOUVEAU_BO_RD);
+ draw_count = *count;
+ }
+
+ uint8_t *buf_data = nouveau_resource_map_offset(
+ &nvc0->base, buf, info->indirect->offset, NOUVEAU_BO_RD);
+ struct pipe_draw_info single = *info;
+ single.indirect = NULL;
+ for (i = 0; i < draw_count; i++, buf_data += info->indirect->stride) {
+ if (info->index_size) {
+ DrawElementsIndirectCommand *cmd = (void *)buf_data;
+ single.start = info->start + cmd->firstIndex;
+ single.count = cmd->count;
+ single.start_instance = cmd->baseInstance;
+ single.instance_count = cmd->primCount;
+ single.index_bias = cmd->baseVertex;
+ } else {
+ DrawArraysIndirectCommand *cmd = (void *)buf_data;
+ single.start = cmd->first;
+ single.count = cmd->count;
+ single.start_instance = cmd->baseInstance;
+ single.instance_count = cmd->primCount;
+ }
+
+ if (nvc0->vertprog->vp.need_draw_parameters) {
+ PUSH_SPACE(push, 9);
+ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+ PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+ PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+ PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+ BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
+ PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
+ PUSH_DATA (push, single.index_bias);
+ PUSH_DATA (push, single.start_instance);
+ PUSH_DATA (push, single.drawid + i);
+ }
+
+ nvc0_push_vbo(nvc0, &single);
+ }
+
+ nouveau_resource_unmap(buf);
+ if (buf_count)
+ nouveau_resource_unmap(buf_count);
+}
+
void
nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
{
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 19d3a1bae30..be0b475e5ef 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -304,6 +304,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
return 2048;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 10;
+
case PIPE_CAP_VENDOR_ID:
return 0x1002;
case PIPE_CAP_DEVICE_ID:
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index ade1a94ab32..41a878ab9d2 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -536,6 +536,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXEL_OFFSET:
return 7;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 32;
+
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600;
case PIPE_CAP_ENDIANNESS:
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index f86764f5220..96ffbf82927 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1837,18 +1837,17 @@ static void r600_emit_sampler_states(struct r600_context *rctx,
/* TEX_ARRAY_OVERRIDE must be set for array textures to disable
* filtering between layers.
- * Don't update TEX_ARRAY_OVERRIDE if we don't have the sampler view.
*/
- if (rview) {
- enum pipe_texture_target target = rview->base.texture->target;
- if (target == PIPE_TEXTURE_1D_ARRAY ||
- target == PIPE_TEXTURE_2D_ARRAY) {
- rstate->tex_sampler_words[0] |= S_03C000_TEX_ARRAY_OVERRIDE(1);
- texinfo->is_array_sampler[i] = true;
- } else {
- rstate->tex_sampler_words[0] &= C_03C000_TEX_ARRAY_OVERRIDE;
- texinfo->is_array_sampler[i] = false;
- }
+ enum pipe_texture_target target = PIPE_BUFFER;
+ if (rview)
+ target = rview->base.texture->target;
+ if (target == PIPE_TEXTURE_1D_ARRAY ||
+ target == PIPE_TEXTURE_2D_ARRAY) {
+ rstate->tex_sampler_words[0] |= S_03C000_TEX_ARRAY_OVERRIDE(1);
+ texinfo->is_array_sampler[i] = true;
+ } else {
+ rstate->tex_sampler_words[0] &= C_03C000_TEX_ARRAY_OVERRIDE;
+ texinfo->is_array_sampler[i] = false;
}
radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0));
diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c b/src/gallium/drivers/radeon/radeon_vcn_dec.c
index a4e6d9dc6b5..688cef90103 100644
--- a/src/gallium/drivers/radeon/radeon_vcn_dec.c
+++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c
@@ -64,6 +64,7 @@ static rvcn_dec_message_avc_t get_h264_msg(struct radeon_decoder *dec,
memset(&result, 0, sizeof(result));
switch (pic->base.profile) {
case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE:
result.profile = RDECODE_H264_PROFILE_BASELINE;
break;
@@ -490,7 +491,7 @@ static rvcn_dec_message_vp9_t get_vp9_msg(struct radeon_decoder *dec,
assert(dec->base.max_references + 1 <= 16);
- for (i = 0 ; i < dec->base.max_references + 1 ; ++i) {
+ for (i = 0 ; i < 16 ; ++i) {
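+ /* Scan the whole 16-entry render_pic_list; the target is not
+ * guaranteed to sit within the first max_references + 1 slots. */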
if (dec->render_pic_list[i] && dec->render_pic_list[i] == target) {
result.curr_pic_idx =
(uintptr_t)vl_video_buffer_get_associated_data(target, &dec->base);
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 713629c6e87..3cdd0851a5c 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -14,6 +14,7 @@ C_SOURCES := \
si_compute_blit.c \
si_cp_dma.c \
si_debug.c \
+ si_debug_options.h \
si_descriptors.c \
si_dma.c \
si_dma_cs.c \
diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
index cbf3bb01fb3..000a300746e 100644
--- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
+++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
@@ -11,5 +11,14 @@ DRI_CONF_SECTION_PERFORMANCE
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
- DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false")
+
+//= BEGIN VERBATIM
+#define OPT_BOOL(name, dflt, description) \
+ DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \
+ DRI_CONF_DESC(en, description) \
+ DRI_CONF_OPT_END
+
+#include "radeonsi/si_debug_options.h"
+//= END VERBATIM
+
DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 03c11cb7013..3845e56a4b3 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
struct si_resource *buf = si_resource(transfer->resource);
if (stransfer->staging) {
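+ /* box->x is relative to the resource, but the staging buffer was
+ * mapped starting at transfer->box.x, so rebase the source offset
+ * onto the staging allocation. */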
+ unsigned src_offset = stransfer->offset +
+ transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
+ (box->x - transfer->box.x);
+
/* Copy the staging buffer into the original one. */
si_copy_buffer((struct si_context*)ctx, transfer->resource,
- &stransfer->staging->b.b, box->x,
- stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT,
+ &stransfer->staging->b.b, box->x, src_offset,
box->width);
}
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 9026f61dc0a..ef25c79fa9c 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -272,7 +272,7 @@ void vi_dcc_clear_level(struct si_context *sctx,
}
si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
- &clear_value, 4, SI_COHERENCY_CB_META);
+ &clear_value, 4, SI_COHERENCY_CB_META, false);
}
/* Set the same micro tile mode as the destination of the last MSAA resolve.
@@ -505,7 +505,7 @@ static void si_do_fast_color_clear(struct si_context *sctx,
uint32_t clear_value = 0xCCCCCCCC;
si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
tex->cmask_offset, tex->surface.cmask_size,
- &clear_value, 4, SI_COHERENCY_CB_META);
+ &clear_value, 4, SI_COHERENCY_CB_META, false);
fmask_decompress_needed = true;
}
@@ -533,7 +533,7 @@ static void si_do_fast_color_clear(struct si_context *sctx,
uint32_t clear_value = 0;
si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
tex->cmask_offset, tex->surface.cmask_size,
- &clear_value, 4, SI_COHERENCY_CB_META);
+ &clear_value, 4, SI_COHERENCY_CB_META, false);
eliminate_needed = true;
}
@@ -647,7 +647,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
* This hack decreases back-to-back ClearDepth performance.
*/
if ((sctx->db_depth_clear || sctx->db_stencil_clear) &&
- sctx->screen->clear_db_cache_before_clear)
+ sctx->screen->options.clear_db_cache_before_clear)
sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
}
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 38c48c30be9..304296c4a52 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -177,7 +177,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx,
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, uint32_t *clear_value,
- uint32_t clear_value_size, enum si_coherency coher)
+ uint32_t clear_value_size, enum si_coherency coher,
+ bool force_cpdma)
{
if (!size)
return;
@@ -241,7 +242,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
* about buffer placements.
*/
if (clear_value_size > 4 ||
- (clear_value_size == 4 &&
+ (!force_cpdma &&
+ clear_value_size == 4 &&
offset % 4 == 0 &&
(size > 32*1024 || sctx->chip_class <= VI))) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
@@ -282,7 +284,7 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx,
coher = SI_COHERENCY_SHADER;
si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
- clear_value_size, coher);
+ clear_value_size, coher, false);
}
void si_copy_buffer(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
new file mode 100644
index 00000000000..165dba8baf5
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@@ -0,0 +1,4 @@
+OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth clear")
+OPT_BOOL(enable_nir, false, "Enable NIR")
+
+#undef OPT_BOOL
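
For reference, the OPT_BOOL X-macro header is included three times with
different definitions (driinfo_radeonsi.h above, si_pipe.h and si_pipe.c
below); a sketch of the mechanical expansion at the latter two sites:

    /* si_pipe.h, inside struct si_screen: */
    struct {
       bool clear_db_cache_before_clear:1;
       bool enable_nir:1;
    } options;

    /* si_pipe.c, at screen creation: */
    sscreen->options.clear_db_cache_before_clear =
       driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear");
    sscreen->options.enable_nir =
       driQueryOptionb(config->options, "radeonsi_enable_nir");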
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index bb2d8c09eb1..ff25a976e77 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -208,7 +208,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
RADEON_SPARSE_PAGE_SIZE : 0;
case PIPE_CAP_PACKED_UNIFORMS:
- if (sscreen->debug_flags & DBG(NIR))
+ if (sscreen->options.enable_nir)
return 1;
return 0;
@@ -254,6 +254,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
return 30;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 32;
+
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
return sscreen->info.chip_class <= VI ?
PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
@@ -420,11 +423,11 @@ static int si_get_shader_param(struct pipe_screen* pscreen,
case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
return SI_NUM_IMAGES;
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
- if (sscreen->debug_flags & DBG(NIR))
+ if (sscreen->options.enable_nir)
return 0;
return 32;
case PIPE_SHADER_CAP_PREFERRED_IR:
- if (sscreen->debug_flags & DBG(NIR))
+ if (sscreen->options.enable_nir)
return PIPE_SHADER_IR_NIR;
return PIPE_SHADER_IR_TGSI;
case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 2da14f8868f..d55394f2cba 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -1333,7 +1333,7 @@ void si_init_perfcounters(struct si_screen *screen)
for (i = 0; i < num_blocks; ++i) {
struct si_pc_block *block = &pc->blocks[i];
block->b = &blocks[i];
- block->num_instances = block->b->instances;
+ block->num_instances = MAX2(1, block->b->instances);
if (!strcmp(block->b->b->name, "CB") ||
!strcmp(block->b->b->name, "DB"))
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 41d395d7d3f..507ca65605f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -61,7 +61,6 @@ static const struct debug_named_value debug_options[] = {
/* Shader compiler options (with no effect on the shader cache): */
{ "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
- { "nir", DBG(NIR), "Enable experimental NIR shaders" },
{ "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
{ "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },
@@ -609,11 +608,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
si_begin_new_gfx_cs(sctx);
if (sctx->chip_class == CIK) {
- /* Clear the NULL constant buffer, because loads should return zeros. */
+ /* Clear the NULL constant buffer, because loads should return zeros.
+ * Note that this forces CP DMA to be used, because clover deadlocks
+ * for some reason when the compute codepath is used.
+ */
uint32_t clear_value = 0;
si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
sctx->null_const_buf.buffer->width0,
- &clear_value, 4, SI_COHERENCY_SHADER);
+ &clear_value, 4, SI_COHERENCY_SHADER, true);
}
return &sctx->b;
fail:
@@ -804,8 +806,7 @@ static void si_disk_cache_create(struct si_screen *sscreen)
#define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) | \
DBG(SI_SCHED) | \
DBG(GISEL) | \
- DBG(UNSAFE_MATH) | \
- DBG(NIR))
+ DBG(UNSAFE_MATH))
uint64_t shader_debug_flags = sscreen->debug_flags &
ALL_FLAGS;
@@ -813,7 +814,11 @@ static void si_disk_cache_create(struct si_screen *sscreen)
* how 32-bit addresses are expanded to 64 bits.
*/
STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
- shader_debug_flags |= (uint64_t)sscreen->info.address32_hi << 32;
+ assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
+ shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
+
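+ /* address32_hi is asserted above to fit in 16 bits, leaving bit 48
+ * free to fold the NIR option into the disk cache key. */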
+ if (sscreen->options.enable_nir)
+ shader_debug_flags |= 1ull << 48;
sscreen->disk_shader_cache =
disk_cache_create(sscreen->info.name,
@@ -866,7 +871,6 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
if (driQueryOptionb(config->options, "radeonsi_enable_sisched"))
sscreen->debug_flags |= DBG(SI_SCHED);
-
if (sscreen->debug_flags & DBG(INFO))
ac_print_gpu_info(&sscreen->info);
@@ -1013,8 +1017,16 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
sscreen->commutative_blend_add =
driQueryOptionb(config->options, "radeonsi_commutative_blend_add");
- sscreen->clear_db_cache_before_clear =
- driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear");
+
+ {
+#define OPT_BOOL(name, dflt, description) \
+ sscreen->options.name = \
+ driQueryOptionb(config->options, "radeonsi_"#name);
+#include "si_debug_options.h"
+ }
+
+ sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 ||
+ sscreen->info.family == CHIP_RAVEN;
sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 &&
sscreen->info.family <= CHIP_POLARIS12) ||
sscreen->info.family == CHIP_VEGA10 ||
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index eb3ba951dae..ea009622970 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -133,7 +133,6 @@ enum {
/* Shader compiler options (with no effect on the shader cache): */
DBG_CHECK_IR,
- DBG_NIR,
DBG_MONOLITHIC_SHADERS,
DBG_NO_OPT_VARIANT,
@@ -445,7 +444,7 @@ struct si_screen {
bool has_out_of_order_rast;
bool assume_no_z_fights;
bool commutative_blend_add;
- bool clear_db_cache_before_clear;
+ bool has_gfx9_scissor_bug;
bool has_msaa_sample_loc_bug;
bool has_ls_vgpr_init_bug;
bool has_dcc_constant_encode;
@@ -453,6 +452,11 @@ struct si_screen {
bool dfsm_allowed;
bool llvm_has_working_vgpr_indexing;
+ struct {
+#define OPT_BOOL(name, dflt, description) bool name:1;
+#include "si_debug_options.h"
+ } options;
+
/* Whether shaders are monolithic (1-part) or separate (3-part). */
bool use_monolithic_shaders;
bool record_llvm_ir;
@@ -1054,7 +1058,7 @@ struct si_context {
unsigned num_resident_handles;
uint64_t num_alloc_tex_transfer_bytes;
unsigned last_tex_ps_draw_ratio; /* for query */
- unsigned context_roll_counter;
+ unsigned context_roll;
/* Queries. */
/* Maintain the list of active queries for pausing between IBs. */
@@ -1168,7 +1172,8 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
enum si_cache_policy cache_policy);
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, uint32_t *clear_value,
- uint32_t clear_value_size, enum si_coherency coher);
+ uint32_t clear_value_size, enum si_coherency coher,
+ bool force_cpdma);
void si_copy_buffer(struct si_context *sctx,
struct pipe_resource *dst, struct pipe_resource *src,
uint64_t dst_offset, uint64_t src_offset, unsigned size);
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 266b9d3ce84..280eee3a280 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -549,11 +549,15 @@ void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buff
}
buffer->results_end = 0;
+ if (!buffer->buf)
+ return;
+
/* Discard even the oldest buffer if it can't be mapped without a stall. */
- if (buffer->buf &&
- (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE))) {
+ if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
si_resource_reference(&buffer->buf, NULL);
+ } else {
+ buffer->unprepared = true;
}
}
@@ -561,29 +565,31 @@ bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buff
bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
unsigned size)
{
- if (buffer->buf && buffer->results_end + size >= buffer->buf->b.b.width0)
- return true;
+ bool unprepared = buffer->unprepared;
+ buffer->unprepared = false;
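+ /* prepare_buffer only has to run for a freshly allocated buffer, or
+ * for one that survived a reset (see si_query_buffer_reset). */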
+
+ if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
+ if (buffer->buf) {
+ struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
+ memcpy(qbuf, buffer, sizeof(*qbuf));
+ buffer->previous = qbuf;
+ }
+ buffer->results_end = 0;
- if (buffer->buf) {
- struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
- memcpy(qbuf, buffer, sizeof(*qbuf));
- buffer->previous = qbuf;
+ /* Queries are normally read by the CPU after
+ * being written by the GPU, hence staging is probably a good
+ * usage pattern.
+ */
+ struct si_screen *screen = sctx->screen;
+ unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
+ buffer->buf = si_resource(
+ pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+ if (unlikely(!buffer->buf))
+ return false;
+ unprepared = true;
}
- buffer->results_end = 0;
-
- /* Queries are normally read by the CPU after
- * being written by the gpu, hence staging is probably a good
- * usage pattern.
- */
- struct si_screen *screen = sctx->screen;
- unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
- buffer->buf = si_resource(
- pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
- if (unlikely(!buffer->buf))
- return false;
-
- if (prepare_buffer) {
+ if (unprepared && prepare_buffer) {
if (unlikely(!prepare_buffer(sctx, buffer))) {
si_resource_reference(&buffer->buf, NULL);
return false;
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index aaf0bd03aca..c61af51d57c 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -177,12 +177,13 @@ struct si_query_hw_ops {
struct si_query_buffer {
/* The buffer where query results are stored. */
struct si_resource *buf;
- /* Offset of the next free result after current query data */
- unsigned results_end;
/* If a query buffer is full, a new buffer is created and the old one
* is put in here. When we calculate the result, we sum up the samples
* from all buffers. */
struct si_query_buffer *previous;
+ /* Offset of the next free result after current query data */
+ unsigned results_end;
+ bool unprepared;
};
void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer);
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 7554f5b9f8b..d7618b46eb0 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -834,7 +834,7 @@ si_lower_nir(struct si_shader_selector* sel)
NIR_PASS(progress, sel->nir, nir_opt_if);
NIR_PASS(progress, sel->nir, nir_opt_dead_cf);
NIR_PASS(progress, sel->nir, nir_opt_cse);
- NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true);
/* Needed for algebraic lowering */
NIR_PASS(progress, sel->nir, nir_opt_algebraic);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 89d81c97e18..85103a614b1 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -256,7 +256,7 @@ static void si_emit_cb_render_state(struct si_context *sctx)
sx_blend_opt_control);
}
if (initial_cdw != cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/*
@@ -792,7 +792,7 @@ static void si_emit_clip_regs(struct si_context *sctx)
S_028810_CLIP_DISABLE(window_space));
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/*
@@ -1446,7 +1446,7 @@ static void si_emit_db_render_state(struct si_context *sctx)
SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/*
@@ -3527,7 +3527,7 @@ static void si_emit_msaa_config(struct si_context *sctx)
SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
if (initial_cdw != cs->current.cdw) {
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
/* GFX9: Flush DFSM when the AA mode changes. */
if (sctx->screen->dfsm_allowed) {
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 767e789276a..344f45e7e43 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -224,7 +224,8 @@ static inline unsigned si_atoms_that_always_roll_context(void)
SI_ATOM_BIT(scissors) |
SI_ATOM_BIT(viewports) |
SI_ATOM_BIT(stencil_ref) |
- SI_ATOM_BIT(scratch_state));
+ SI_ATOM_BIT(scratch_state) |
+ SI_ATOM_BIT(window_rectangles));
}
struct si_shader_data {
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index 3516e561282..5c6c2e69b90 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -321,7 +321,7 @@ static void si_emit_dpbb_disable(struct si_context *sctx)
S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
void si_emit_dpbb_state(struct si_context *sctx)
@@ -443,5 +443,5 @@ void si_emit_dpbb_state(struct si_context *sctx)
S_028060_PUNCHOUT_MODE(punchout_mode) |
S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index c7c02d20d15..7bf82b8b05b 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -66,7 +66,7 @@ static unsigned si_conv_pipe_prim(unsigned mode)
* The information about LDS and other non-compile-time parameters is then
* written to userdata SGPRs.
*/
-static bool si_emit_derived_tess_state(struct si_context *sctx,
+static void si_emit_derived_tess_state(struct si_context *sctx,
const struct pipe_draw_info *info,
unsigned *num_patches)
{
@@ -110,7 +110,7 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
(!has_primid_instancing_bug ||
(sctx->last_tess_uses_primid == tess_uses_primid))) {
*num_patches = sctx->last_num_patches;
- return false;
+ return;
}
sctx->last_ls = ls_current;
@@ -305,9 +305,8 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
ls_hs_config);
}
sctx->last_ls_hs_config = ls_hs_config;
- return true; /* true if the context rolls */
+ sctx->context_roll = true;
}
- return false;
}
static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
@@ -541,7 +540,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
}
/* rast_prim is the primitive type after GS. */
-static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
+static void si_emit_rasterizer_prim_state(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
enum pipe_prim_type rast_prim = sctx->current_rast_prim;
@@ -549,11 +548,11 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
/* Skip this if not rendering lines. */
if (!util_prim_is_lines(rast_prim))
- return false;
+ return;
if (rast_prim == sctx->last_rast_prim &&
rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)
- return false;
+ return;
/* For lines, reset the stipple pattern at each primitive. Otherwise,
* reset the stipple pattern at each packet (line strips, line loops).
@@ -564,7 +563,7 @@ static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
sctx->last_rast_prim = rast_prim;
sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
- return true; /* true if the context rolls */
+ sctx->context_roll = true;
}
static void si_emit_vs_state(struct si_context *sctx,
@@ -659,6 +658,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
info->restart_index);
sctx->last_restart_index = info->restart_index;
+ sctx->context_roll = true;
}
}
@@ -896,6 +896,10 @@ static void si_emit_surface_sync(struct si_context *sctx,
radeon_emit(cs, 0); /* CP_COHER_BASE */
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
}
+
+ /* ACQUIRE_MEM has an implicit context roll if the current context
+ * is busy. */
+ sctx->context_roll = true;
}
void si_emit_cache_flush(struct si_context *sctx)
@@ -1210,26 +1214,10 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
unsigned skip_atom_mask)
{
unsigned num_patches = 0;
- /* Vega10/Raven scissor bug workaround. When any context register is
- * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
- * registers must be written too.
- */
- bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
- !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
- bool context_roll = false; /* set correctly for GFX9 only */
- context_roll |= si_emit_rasterizer_prim_state(sctx);
+ si_emit_rasterizer_prim_state(sctx);
if (sctx->tes_shader.cso)
- context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
-
- if (handle_scissor_bug &&
- (info->count_from_stream_output ||
- sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
- sctx->dirty_states & si_states_that_always_roll_context() ||
- si_prim_restart_index_changed(sctx, info)))
- context_roll = true;
-
- sctx->context_roll_counter = 0;
+ si_emit_derived_tess_state(sctx, info, &num_patches);
/* Emit state atoms. */
unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
@@ -1252,12 +1240,6 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
}
sctx->dirty_states = 0;
- if (handle_scissor_bug &&
- (context_roll || sctx->context_roll_counter)) {
- sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
- sctx->atoms.s.scissors.emit(sctx);
- }
-
/* Emit draw states. */
si_emit_vs_state(sctx, info);
si_emit_draw_registers(sctx, info, num_patches);
@@ -1456,6 +1438,22 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
if (!si_upload_vertex_buffer_descriptors(sctx))
return;
+ /* Vega10/Raven scissor bug workaround. When any context register is
+ * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+ * registers must be written too.
+ */
+ bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug;
+ unsigned masked_atoms = 0;
+
+ if (has_gfx9_scissor_bug) {
+ masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
+
+ if (info->count_from_stream_output ||
+ sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+ sctx->dirty_states & si_states_that_always_roll_context())
+ sctx->context_roll = true;
+ }
+
/* Use optimal packet order based on whether we need to sync the pipeline. */
if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_FLUSH_AND_INV_DB |
@@ -1466,8 +1464,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
* Then draw and prefetch at the end. This ensures that the time
* the CUs are idle is very short.
*/
- unsigned masked_atoms = 0;
-
if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
@@ -1481,6 +1477,13 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
sctx->atoms.s.render_cond.emit(sctx);
+
+ if (has_gfx9_scissor_bug &&
+ (sctx->context_roll ||
+ si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
+ sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ sctx->atoms.s.scissors.emit(sctx);
+ }
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
@@ -1505,7 +1508,16 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
if (!si_upload_graphics_shader_descriptors(sctx))
return;
- si_emit_all_states(sctx, info, 0);
+ si_emit_all_states(sctx, info, masked_atoms);
+
+ if (has_gfx9_scissor_bug &&
+ (sctx->context_roll ||
+ si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
+ sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ sctx->atoms.s.scissors.emit(sctx);
+ }
+ sctx->dirty_atoms = 0;
+
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
/* Prefetch the remaining shaders after the draw has been
@@ -1514,6 +1526,9 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
cik_emit_prefetch_L2(sctx, false);
}
+ /* Clear the context roll flag after the draw call. */
+ sctx->context_roll = false;
+
if (unlikely(sctx->current_saved_cs)) {
si_trace_emit(sctx);
si_log_draw_state(sctx, sctx->log);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 9e052e1efce..e76bb49dff8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -576,7 +576,7 @@ static void si_emit_shader_es(struct si_context *sctx)
shader->vgt_vertex_reuse_block_cntl);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
@@ -825,7 +825,7 @@ static void si_emit_shader_gs(struct si_context *sctx)
}
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@@ -1002,7 +1002,7 @@ static void si_emit_shader_vs(struct si_context *sctx)
shader->vgt_vertex_reuse_block_cntl);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/**
@@ -1194,7 +1194,7 @@ static void si_emit_shader_ps(struct si_context *sctx)
shader->ctx_reg.ps.cb_shader_mask);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
static void si_shader_ps(struct si_shader *shader)
@@ -2869,7 +2869,7 @@ static void si_emit_spi_map(struct si_context *sctx)
sctx->tracked_regs.spi_ps_input_cntl, num_interp);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/**
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index 2bf6862c89b..2a0a4bef9a2 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -303,6 +303,7 @@ void si_emit_streamout_end(struct si_context *sctx)
* buffer bound. This ensures that the primitives-emitted query
* won't increment. */
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
+ sctx->context_roll = true;
t[i]->buf_filled_size_valid = true;
}
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index dac90df1c4f..1ec69216841 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -185,6 +185,16 @@ static void si_emit_guardband(struct si_context *ctx)
const unsigned hw_screen_offset_alignment =
ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
+ /* Indexed by quantization modes */
+ static int max_viewport_size[] = {65535, 16383, 4095};
+
+ /* Ensure that the whole viewport stays representable in
+ * absolute coordinates.
+ * See comment in si_set_viewport_states.
+ */
+ assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
+ vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
+
hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
@@ -219,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx)
*
* The viewport range is [-max_viewport_size/2, max_viewport_size/2].
*/
- static unsigned max_viewport_size[] = {65535, 16383, 4095};
assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
left = (-max_range - vp.translate[0]) / vp.scale[0];
@@ -274,7 +283,7 @@ static void si_emit_guardband(struct si_context *ctx)
S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
vp_as_scissor.quant_mode));
if (initial_cdw != ctx->gfx_cs->current.cdw)
- ctx->context_roll_counter++;
+ ctx->context_roll = true;
}
static void si_emit_scissors(struct si_context *ctx)
@@ -333,6 +342,8 @@ static void si_set_viewport_states(struct pipe_context *pctx,
unsigned h = scissor->maxy - scissor->miny;
unsigned max_extent = MAX2(w, h);
+ int max_corner = MAX2(scissor->maxx, scissor->maxy);
+
unsigned center_x = (scissor->maxx + scissor->minx) / 2;
unsigned center_y = (scissor->maxy + scissor->miny) / 2;
unsigned max_center = MAX2(center_x, center_y);
@@ -358,7 +369,22 @@ static void si_set_viewport_states(struct pipe_context *pctx,
if (ctx->family == CHIP_RAVEN)
max_extent = 16384; /* Use QUANT_MODE == 16_8. */
- if (max_extent <= 1024) /* 4K scanline area for guardband */
+ /* Another constraint is that all coordinates in the viewport
+ * are representable in fixed point with respect to the
+ * surface origin.
+ *
+ * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
+ * an offset that would make the upper corner of the viewport
+ * greater than the maximum representable number post
+ * quantization, i.e. 2^quant_bits.
+ *
+ * This does not matter for 14.10 and 16.8 formats since the
+ * offset is already limited to 8k, but it means we can't use
+ * 12.12 if we are drawing to some pixels outside the lower
+ * 4k x 4k of the render target.
+ */
+
+ if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
else if (max_extent <= 4096) /* 16K scanline area for guardband */
scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
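The viewport hunks above gate the 12.12 quantization mode on the scissor corner staying inside the 4096x4096 area that 12 integer bits can address, in addition to the existing extent checks. A self-contained sketch of that selection logic, with the thresholds taken from the hunk and illustrative enum names:

/* Quantization-mode choice as described in the comments above.
 * QUANT_* are illustrative stand-ins for the SI_QUANT_MODE_* enums. */
#include <stdio.h>

enum quant_mode { QUANT_12_12, QUANT_14_10, QUANT_16_8 };

static enum quant_mode pick_quant_mode(unsigned max_extent, unsigned max_corner)
{
   if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area */
      return QUANT_12_12;
   if (max_extent <= 4096)                      /* 16K scanline area */
      return QUANT_14_10;
   return QUANT_16_8;                           /* 64K scanline area */
}

int main(void)
{
   /* A 900x900 viewport whose corner reaches past 4096 cannot use 12.12. */
   printf("%d\n", pick_quant_mode(900, 5000));  /* prints 1 (QUANT_14_10) */
   return 0;
}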
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index 90a2032cd80..7e396e671be 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -309,7 +309,7 @@ void si_test_dma(struct si_screen *sscreen)
/* clear dst pixels */
uint32_t zero = 0;
si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
- SI_COHERENCY_SHADER);
+ SI_COHERENCY_SHADER, false);
memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
/* preparation */
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 44e48cc7ee4..6931b52dc9f 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -265,6 +265,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1;
case PIPE_CAP_CLEAR_TEXTURE:
return 1;
+ case PIPE_CAP_MAX_VARYINGS:
+ return TGSI_EXEC_MAX_INPUT_ATTRIBS;
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index ffe49260b9a..a91e4f588c8 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -390,17 +390,6 @@ setup_sort_vertices(struct setup_context *setup,
return FALSE;
}
-
- /* Prepare pixel offset for rasterisation:
- * - pixel center (0.5, 0.5) for GL, or
- * - assume (0.0, 0.0) for other APIs.
- */
- if (setup->softpipe->rasterizer->half_pixel_center) {
- setup->pixel_offset = 0.5f;
- } else {
- setup->pixel_offset = 0.0f;
- }
-
return TRUE;
}
@@ -1476,6 +1465,16 @@ sp_setup_prepare(struct setup_context *setup)
}
}
+ /* Prepare pixel offset for rasterisation:
+ * - pixel center (0.5, 0.5) for GL, or
+ * - assume (0.0, 0.0) for other APIs.
+ */
+ if (setup->softpipe->rasterizer->half_pixel_center) {
+ setup->pixel_offset = 0.5f;
+ } else {
+ setup->pixel_offset = 0.0f;
+ }
+
setup->max_layer = max_layer;
sp->quad.first->begin( sp->quad.first );
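Moving the pixel_offset selection from setup_sort_vertices() into sp_setup_prepare() sets it once per scene instead of once per triangle, which also covers point and line setup paths that read pixel_offset but never ran the triangle-only initialisation. For reference, a toy sketch of what the offset means downstream (the subtraction site and values are illustrative):

/* pixel_offset shifts incoming vertex positions so rasterisation
 * samples at pixel centres (0.5, 0.5) for GL, or (0, 0) for APIs
 * with an integer pixel origin. Values here are made up. */
#include <stdio.h>

int main(void)
{
   float pixel_offset = 0.5f;           /* half_pixel_center enabled */
   float vx = 10.0f;                    /* incoming vertex x */
   float sample_x = vx - pixel_offset;  /* position used during setup */
   printf("sample at %.1f\n", sample_x);
   return 0;
}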
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 351736ee421..998939bdf30 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -373,17 +373,18 @@ sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc, int layer)
if (util_format_is_pure_uint(tc->surface->format)) {
pipe_put_tile_ui_format(pt, tc->transfer_map[layer],
x, y, TILE_SIZE, TILE_SIZE,
- pt->resource->format,
+ tc->surface->format,
(unsigned *) tc->tile->data.colorui128);
} else if (util_format_is_pure_sint(tc->surface->format)) {
pipe_put_tile_i_format(pt, tc->transfer_map[layer],
x, y, TILE_SIZE, TILE_SIZE,
- pt->resource->format,
+ tc->surface->format,
(int *) tc->tile->data.colori128);
} else {
- pipe_put_tile_rgba(pt, tc->transfer_map[layer],
- x, y, TILE_SIZE, TILE_SIZE,
- (float *) tc->tile->data.color);
+ pipe_put_tile_rgba_format(pt, tc->transfer_map[layer],
+ x, y, TILE_SIZE, TILE_SIZE,
+ tc->surface->format,
+ (float *) tc->tile->data.color);
}
}
numCleared++;
diff --git a/src/gallium/drivers/svga/Makefile.sources b/src/gallium/drivers/svga/Makefile.sources
index 72024cf60e1..229d2863c84 100644
--- a/src/gallium/drivers/svga/Makefile.sources
+++ b/src/gallium/drivers/svga/Makefile.sources
@@ -15,8 +15,6 @@ C_SOURCES := \
svga_hw_reg.h \
svga_link.c \
svga_link.h \
- svga_msg.c \
- svga_msg.h \
svga_mksstats.h \
svga_pipe_blend.c \
svga_pipe_blit.c \
diff --git a/src/gallium/drivers/svga/meson.build b/src/gallium/drivers/svga/meson.build
index 7981e2991f3..4d3207a9942 100644
--- a/src/gallium/drivers/svga/meson.build
+++ b/src/gallium/drivers/svga/meson.build
@@ -27,7 +27,6 @@ files_svga = files(
'svga_draw_elements.c',
'svga_format.c',
'svga_link.c',
- 'svga_msg.c',
'svga_pipe_blend.c',
'svga_pipe_blit.c',
'svga_pipe_clear.c',
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 5557d208171..6577c839cf0 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -1693,7 +1693,7 @@ SVGA3D_BindGBSurface(struct svga_winsys_context *swc,
return PIPE_ERROR_OUT_OF_MEMORY;
swc->surface_relocation(swc, &cmd->sid, &cmd->mobid, surface,
- SVGA_RELOC_READ | SVGA_RELOC_INTERNAL);
+ SVGA_RELOC_READ);
swc->commit(swc);
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 95dde8b0897..f747ff78bcf 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -37,7 +37,6 @@
#include "svga_public.h"
#include "svga_context.h"
#include "svga_format.h"
-#include "svga_msg.h"
#include "svga_screen.h"
#include "svga_tgsi.h"
#include "svga_resource_texture.h"
@@ -350,6 +349,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */
+ case PIPE_CAP_MAX_VARYINGS:
+ return sws->have_vgpu10 ? VGPU10_MAX_FS_INPUTS : 10;
/* Unsupported features */
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -889,17 +890,18 @@ svga_get_driver_query_info(struct pipe_screen *screen,
static void
init_logging(struct pipe_screen *screen)
{
+ struct svga_screen *svgascreen = svga_screen(screen);
static const char *log_prefix = "Mesa: ";
char host_log[1000];
/* Log Version to Host */
util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix),
- "%s%s", log_prefix, svga_get_name(screen));
- svga_host_log(host_log);
+ "%s%s\n", log_prefix, svga_get_name(screen));
+ svgascreen->sws->host_log(svgascreen->sws, host_log);
util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix),
"%s" PACKAGE_VERSION MESA_GIT_SHA1, log_prefix);
- svga_host_log(host_log);
+ svgascreen->sws->host_log(svgascreen->sws, host_log);
/* If the SVGA_EXTRA_LOGGING env var is set, log the process's command
* line (program name and arguments).
@@ -908,13 +910,23 @@ init_logging(struct pipe_screen *screen)
char cmdline[1000];
if (os_get_command_line(cmdline, sizeof(cmdline))) {
util_snprintf(host_log, sizeof(host_log) - strlen(log_prefix),
- "%s%s", log_prefix, cmdline);
- svga_host_log(host_log);
+ "%s%s\n", log_prefix, cmdline);
+ svgascreen->sws->host_log(svgascreen->sws, host_log);
}
}
}
+/**
+ * No-op logging function to use when SVGA_NO_LOGGING is set.
+ */
+static void
+nop_host_log(struct svga_winsys_screen *sws, const char *message)
+{
+ /* nothing */
+}
+
+
static void
svga_destroy_screen( struct pipe_screen *screen )
{
@@ -1132,7 +1144,11 @@ svga_screen_create(struct svga_winsys_screen *sws)
svga_screen_cache_init(svgascreen);
- init_logging(screen);
+ if (debug_get_bool_option("SVGA_NO_LOGGING", FALSE) == TRUE) {
+ svgascreen->sws->host_log = nop_host_log;
+ } else {
+ init_logging(screen);
+ }
return screen;
error2:
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index a067a7ba09d..14782e19a7d 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -753,6 +753,11 @@ struct svga_winsys_screen
void
(*stats_time_pop)();
+ /**
+ * Send a host log message
+ */
+ void
+ (*host_log)(struct svga_winsys_screen *sws, const char *message);
/** Have VGPU v10 hardware? */
boolean have_vgpu10;
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index bd48fb2aae7..153e2af7eae 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -565,68 +565,3 @@ using Vec4 = typename SIMD_T::Vec4;
template <typename SIMD_T>
using Mask = typename SIMD_T::Mask;
-template <typename SIMD_T>
-struct SIMDVecEqual
-{
- INLINE bool operator()(Integer a, Integer b) const
- {
- Integer c = SIMD_T::xor_si(a, b);
- return SIMD_T::testz_si(c, c);
- }
-
- INLINE bool operator()(Float a, Float b) const
- {
- return this->operator()(SIMD_T::castps_si(a), SIMD_T::castps_si(b));
- }
-
- INLINE bool operator()(Double a, Double b) const
- {
- return this->operator()(SIMD_T::castpd_si(a), SIMD_T::castpd_si(b));
- }
-};
-
-template <typename SIMD_T>
-struct SIMDVecHash
-{
- INLINE uint32_t operator()(Integer val) const
- {
-#if defined(_WIN64) || !defined(_WIN32) // assume non-Windows is always 64-bit
- static_assert(sizeof(void*) == 8, "This path only meant for 64-bit code");
-
- uint64_t crc32 = 0;
- const uint64_t* pData = reinterpret_cast<const uint64_t*>(&val);
- static const uint32_t loopIterations = sizeof(val) / sizeof(void*);
- static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad vector size");
-
- for (uint32_t i = 0; i < loopIterations; ++i)
- {
- crc32 = _mm_crc32_u64(crc32, pData[i]);
- }
-
- return static_cast<uint32_t>(crc32);
-#else
- static_assert(sizeof(void*) == 4, "This path only meant for 32-bit code");
-
- uint32_t crc32 = 0;
- const uint32_t* pData = reinterpret_cast<const uint32_t*>(&val);
- static const uint32_t loopIterations = sizeof(val) / sizeof(void*);
- static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad vector size");
-
- for (uint32_t i = 0; i < loopIterations; ++i)
- {
- crc32 = _mm_crc32_u32(crc32, pData[i]);
- }
-
- return crc32;
-#endif
- };
-
- INLINE uint32_t operator()(Float val) const
- {
- return operator()(SIMD_T::castps_si(val));
- };
- INLINE uint32_t operator()(Double val) const
- {
- return operator()(SIMD_T::castpd_si(val));
- }
-};
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index de9008ddf6a..ea63368f750 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -36,6 +36,7 @@
#include "util/u_cpu_detect.h"
#include "util/u_format_s3tc.h"
#include "util/u_string.h"
+#include "util/u_screen.h"
#include "state_tracker/sw_winsys.h"
@@ -369,6 +370,8 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 32;
case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
return 1 << 27;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 32;
case PIPE_CAP_VENDOR_ID:
return 0xFFFFFFFF;
@@ -385,11 +388,9 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
return (int)(system_memory >> 20);
}
+ default:
+ return u_pipe_screen_get_param_defaults(screen, param);
}
-
- /* should only get here on unhandled cases */
- debug_printf("Unexpected PIPE_CAP %d query\n", param);
- return 0;
}
static int
@@ -844,7 +845,9 @@ swr_texture_layout(struct swr_screen *screen,
size_t total_size = (uint64_t)res->swr.depth * res->swr.qpitch *
res->swr.pitch * res->swr.numSamples;
- if (total_size > SWR_MAX_TEXTURE_SIZE)
+
+ // Let non-sampled textures (e.g. buffer objects) bypass the size limit
+ if (swr_resource_is_texture(&res->base) && total_size > SWR_MAX_TEXTURE_SIZE)
return false;
if (allocate) {
diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
index 2f36bdd46e3..d42e8fd0e69 100644
--- a/src/gallium/drivers/v3d/v3d_blit.c
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@@ -491,7 +491,8 @@ v3d_tfu_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
if ((info->mask & PIPE_MASK_RGBA) == 0)
return false;
- if (info->dst.box.x != 0 ||
+ if (info->scissor_enable ||
+ info->dst.box.x != 0 ||
info->dst.box.y != 0 ||
info->dst.box.width != dst_width ||
info->dst.box.height != dst_height ||
diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
index 21c68942e14..84e86799d5e 100644
--- a/src/gallium/drivers/v3d/v3d_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -780,7 +780,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
rsc->tiled = false;
} else {
fprintf(stderr, "Unsupported modifier requested\n");
- return NULL;
+ goto fail;
}
rsc->internal_format = prsc->format;
diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
index bed2c63a64d..17afeebb4fc 100644
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -70,6 +70,7 @@ v3d_screen_destroy(struct pipe_screen *pscreen)
util_hash_table_destroy(screen->bo_handles);
v3d_bufmgr_destroy(pscreen);
slab_destroy_parent(&screen->transfer_pool);
+ free(screen->ro);
if (using_v3d_simulator)
v3d_simulator_destroy(screen);
@@ -177,11 +178,17 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
return 4;
+ case PIPE_CAP_MAX_VARYINGS:
+ return V3D_MAX_FS_INPUTS / 4;
+
/* Texturing. */
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
- return V3D_MAX_MIP_LEVELS;
+ if (screen->devinfo.ver < 40)
+ return 12;
+ else
+ return V3D_MAX_MIP_LEVELS;
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return 2048;
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 2700208e388..4b1b03b5db5 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -55,7 +55,28 @@ v3d_start_draw(struct v3d_context *v3d)
job->submit.bcl_start = job->bcl.bo->offset;
v3d_job_add_bo(job, job->bcl.bo);
- job->tile_alloc = v3d_bo_alloc(v3d->screen, 1024 * 1024, "tile_alloc");
+ /* The PTB will request the tile alloc initial size per tile at start
+ * of tile binning.
+ */
+ uint32_t tile_alloc_size = (job->draw_tiles_x *
+ job->draw_tiles_y) * 64;
+ /* The PTB allocates in aligned 4k chunks after the initial setup. */
+ tile_alloc_size = align(tile_alloc_size, 4096);
+
+ /* Include the first two chunk allocations that the PTB does so that
+ * we definitely clear the OOM condition before triggering one (the HW
+ * won't trigger OOM during the first allocations).
+ */
+ tile_alloc_size += 8192;
+
+ /* For performance, allocate some extra initial memory after the PTB's
+ * minimal allocations, so that we hopefully don't have to block the
+ * GPU on the kernel handling an OOM signal.
+ */
+ tile_alloc_size += 512 * 1024;
+
+ job->tile_alloc = v3d_bo_alloc(v3d->screen, tile_alloc_size,
+ "tile_alloc");
uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64;
job->tile_state = v3d_bo_alloc(v3d->screen,
job->draw_tiles_y *
@@ -203,8 +224,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
* shader needs to write the Z value (even just discards).
*/
shader.fragment_shader_does_z_writes =
- (v3d->prog.fs->prog_data.fs->writes_z ||
- v3d->prog.fs->prog_data.fs->discard);
+ v3d->prog.fs->prog_data.fs->writes_z;
+ /* Set if the EZ test must be disabled (due to shader side
+ * effects and the early_z flag not being present in the
+ * shader).
+ */
+ shader.turn_off_early_z_test =
+ v3d->prog.fs->prog_data.fs->disable_ez;
shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
v3d->prog.fs->prog_data.fs->uses_center_w;
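The tile-allocation comments above fully determine the buffer size: 64 bytes per bin tile, rounded up to the PTB's 4 KiB chunk granularity, plus 8 KiB for the first two chunk allocations and 512 KiB of headroom. A standalone sketch of that arithmetic (align_u32 mirrors a power-of-two align helper; the bin-grid dimensions in main are made up):

#include <stdio.h>
#include <stdint.h>

static uint32_t align_u32(uint32_t v, uint32_t a) /* a must be a power of two */
{
   return (v + a - 1) & ~(a - 1);
}

static uint32_t tile_alloc_size(uint32_t tiles_x, uint32_t tiles_y)
{
   uint32_t size = tiles_x * tiles_y * 64; /* PTB initial size per tile */
   size = align_u32(size, 4096);           /* PTB allocates 4k chunks */
   size += 8192;                           /* first two PTB chunks */
   size += 512 * 1024;                     /* headroom to avoid OOM stalls */
   return size;
}

int main(void)
{
   printf("%u\n", tile_alloc_size(30, 17)); /* e.g. a 1920x1080 bin grid */
   return 0;
}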
diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
index f326b5379ba..eff6bcfca06 100644
--- a/src/gallium/drivers/v3d/v3dx_state.c
+++ b/src/gallium/drivers/v3d/v3dx_state.c
@@ -846,6 +846,9 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
prsc->target == PIPE_TEXTURE_1D_ARRAY) {
tex->image_height = tex->image_width >> 14;
}
+
+ tex->image_width &= (1 << 14) - 1;
+ tex->image_height &= (1 << 14) - 1;
#endif
if (prsc->target == PIPE_TEXTURE_3D) {
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 2d0a52bb5fb..8f1e561c444 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s)
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_opt_cse);
- NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
NIR_PASS(progress, s, nir_opt_undef);
diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c
index 6e4681e93cc..f08785f457f 100644
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
/* We can't mix HW and non-HW queries. */
if (nhwqueries && nhwqueries != num_queries)
- return NULL;
+ goto err_free_query;
if (!nhwqueries)
return (struct pipe_query *)query;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index e7f7c82c271..acb4a1feb0d 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
/* Note: Not supported in hardware, just faking it. */
return 5;
+ case PIPE_CAP_MAX_VARYINGS:
+ return 8;
+
case PIPE_CAP_VENDOR_ID:
return 0x14E4;
case PIPE_CAP_ACCELERATED:
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 08f85f8574a..f9d8e231a13 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -765,7 +765,6 @@ static void virgl_flush_from_st(struct pipe_context *ctx,
enum pipe_flush_flags flags)
{
struct virgl_context *vctx = virgl_context(ctx);
- struct virgl_screen *rs = virgl_screen(ctx->screen);
if (flags & PIPE_FLUSH_FENCE_FD)
vctx->cbuf->needs_out_fence_fd = true;
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 42e0987e0c9..17fa5fc51cc 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -258,6 +258,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
return 1; /* TODO: need to introduce a hw-cap for this */
+ case PIPE_CAP_MAX_VARYINGS:
+ if (vscreen->caps.caps.v1.glsl_level < 150)
+ return vscreen->caps.caps.v2.max_vertex_attribs;
+ return 32;
case PIPE_CAP_TEXTURE_GATHER_SM5:
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
case PIPE_CAP_FAKE_SW_MSAA:
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 867d0cb5d74..96e8fbed1be 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -856,6 +856,7 @@ enum pipe_cap
PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE,
PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND,
PIPE_CAP_DEST_SURFACE_SRGB_CONTROL,
+ PIPE_CAP_MAX_VARYINGS,
};
/**
diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h
index b5b8b062285..adbe7858d0f 100644
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -70,7 +70,8 @@ enum pipe_video_profile
PIPE_VIDEO_PROFILE_HEVC_MAIN_444,
PIPE_VIDEO_PROFILE_JPEG_BASELINE,
PIPE_VIDEO_PROFILE_VP9_PROFILE0,
- PIPE_VIDEO_PROFILE_VP9_PROFILE2
+ PIPE_VIDEO_PROFILE_VP9_PROFILE2,
+ PIPE_VIDEO_PROFILE_MAX
};
/* Video caps, can be different for each codec/profile */
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index ebbbabb6492..930d440a1e2 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -994,11 +994,6 @@ dri2_create_image_common(__DRIscreen *_screen,
if (!map)
return NULL;
- /* createImageWithModifiers doesn't supply usage, and we should not get
- * here with both modifiers and a usage flag.
- */
- assert(!(use && (modifiers != NULL)));
-
tex_usage = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
if (use & __DRI_IMAGE_USE_SCANOUT)
@@ -1071,7 +1066,7 @@ dri2_create_image_with_modifiers(__DRIscreen *dri_screen,
void *loaderPrivate)
{
return dri2_create_image_common(dri_screen, width, height, format,
- 0 /* use */, modifiers, count,
+ __DRI_IMAGE_USE_SHARE, modifiers, count,
loaderPrivate);
}
diff --git a/src/gallium/state_trackers/glx/xlib/meson.build b/src/gallium/state_trackers/glx/xlib/meson.build
index f4ee75426bc..34b93c94cf2 100644
--- a/src/gallium/state_trackers/glx/xlib/meson.build
+++ b/src/gallium/state_trackers/glx/xlib/meson.build
@@ -23,5 +23,5 @@ libxlib = static_library(
files('glx_api.c', 'glx_getproc.c', 'glx_usefont.c', 'xm_api.c', 'xm_st.c'),
c_args : c_vis_args,
include_directories : [inc_common, inc_mapi, inc_mesa],
- dependencies : [dep_x11, dep_xext, dep_xcb],
+ dependencies : [dep_x11, dep_xext, dep_xcb, dep_glproto],
)
diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h
index 7b68c09c47a..0595da5535a 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.h
+++ b/src/gallium/state_trackers/nine/nine_pipe.h
@@ -377,6 +377,10 @@ d3dmultisample_type_check(struct pipe_screen *screen,
if (levels)
*levels = 1;
+ /* Ignores the multisample quality */
+ if (*multisample == D3DMULTISAMPLE_NONE)
+ return D3D_OK;
+
if (*multisample == D3DMULTISAMPLE_NONMASKABLE) {
if (depth_stencil_format(format))
bind = d3d9_get_pipe_depth_format_bindings(format);
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index 6c22be24c7c..8026ee16b7a 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -133,6 +133,13 @@ D3DWindowBuffer_release(struct NineSwapChain9 *This,
D3DWindowBuffer *present_handle)
{
int i;
+
+ /* IsBufferReleased API not available */
+ if (This->base.device->minor_version_num <= 2) {
+ ID3DPresent_DestroyD3DWindowBuffer(This->present, present_handle);
+ return;
+ }
+
/* Add it to the 'pending release' list */
for (i = 0; i < D3DPRESENT_BACK_BUFFERS_MAX_EX + 1; i++) {
if (!This->present_handles_pending_release[i]) {
@@ -750,9 +757,19 @@ present( struct NineSwapChain9 *This,
if (This->params.SwapEffect == D3DSWAPEFFECT_DISCARD)
handle_draw_cursor_and_hud(This, resource);
- ID3DPresent_GetWindowInfo(This->present, hDestWindowOverride, &target_width, &target_height, &target_depth);
+ hr = ID3DPresent_GetWindowInfo(This->present, hDestWindowOverride, &target_width, &target_height, &target_depth);
(void)target_depth;
+ /* Can happen with old Wine (presentation can still succeed),
+ * or at window destruction.
+ * Also disable for very old Wine, as D3DWindowBuffer_release
+ * cannot do the DestroyD3DWindowBuffer workaround. */
+ if (FAILED(hr) || target_width == 0 || target_height == 0 ||
+ This->base.device->minor_version_num <= 2) {
+ target_width = resource->width0;
+ target_height = resource->height0;
+ }
+
/* Switch to using presentation buffers on window resize.
* Note: Most apps should resize the d3d back buffers when
* a window resize is detected, which will result in a call to
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index 14e904ee490..47a5e7be230 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
ctx->version_minor = 1;
*ctx->vtable = vtable;
*ctx->vtable_vpp = vtable_vpp;
- ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN;
+ ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1;
ctx->max_entrypoints = 2;
ctx->max_attributes = 1;
ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS;
diff --git a/src/gallium/state_trackers/va/picture_mpeg12.c b/src/gallium/state_trackers/va/picture_mpeg12.c
index 1e5a9c7428d..daf95f7403c 100644
--- a/src/gallium/state_trackers/va/picture_mpeg12.c
+++ b/src/gallium/state_trackers/va/picture_mpeg12.c
@@ -27,6 +27,19 @@
#include "va_private.h"
+const int reverse_inverse_zscan[] =
+{
+ /* Reverse inverse z scan pattern */
+ 0, 2, 3, 9, 10, 20, 21, 35,
+ 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48,
+ 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57,
+ 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62,
+ 28, 42, 43, 53, 54, 60, 61, 63,
+};
+
void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
{
VAPictureParameterBufferMPEG2 *mpeg2 = buf->data;
@@ -66,16 +79,29 @@ void vlVaHandlePictureParameterBufferMPEG12(vlVaDriver *drv, vlVaContext *contex
void vlVaHandleIQMatrixBufferMPEG12(vlVaContext *context, vlVaBuffer *buf)
{
VAIQMatrixBufferMPEG2 *mpeg2 = buf->data;
+ static uint8_t temp_intra_matrix[64];
+ static uint8_t temp_nonintra_matrix[64];
assert(buf->size >= sizeof(VAIQMatrixBufferMPEG2) && buf->num_elements == 1);
- if (mpeg2->load_intra_quantiser_matrix)
- context->desc.mpeg12.intra_matrix = mpeg2->intra_quantiser_matrix;
- else
+ if (mpeg2->load_intra_quantiser_matrix) {
+ /* The quantiser matrix that VAAPI provides has already been
+ reordered by the inverse z-scan. However, the MPEG2 picture
+ description expects the original order, so we need to map it
+ back before handing it to the decoder.
+ */
+ for (int i = 0; i < 64; i++)
+ temp_intra_matrix[i] =
+ mpeg2->intra_quantiser_matrix[reverse_inverse_zscan[i]];
+ context->desc.mpeg12.intra_matrix = temp_intra_matrix;
+ } else
context->desc.mpeg12.intra_matrix = NULL;
- if (mpeg2->load_non_intra_quantiser_matrix)
- context->desc.mpeg12.non_intra_matrix = mpeg2->non_intra_quantiser_matrix;
- else
+ if (mpeg2->load_non_intra_quantiser_matrix) {
+ for (int i = 0; i < 64; i++)
+ temp_nonintra_matrix[i] =
+ mpeg2->non_intra_quantiser_matrix[reverse_inverse_zscan[i]];
+ context->desc.mpeg12.non_intra_matrix = temp_nonintra_matrix;
+ } else
context->desc.mpeg12.non_intra_matrix = NULL;
}
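The loops above undo a permutation by indexing through its precomputed inverse: with provided[i] = original[perm[i]], building reverse[perm[i]] = i lets original[i] = provided[reverse[i]] recover the natural order. A toy 4-entry version of the same idea (all values illustrative; the real table has 64 entries):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
   static const int perm[4] = { 0, 2, 3, 1 };      /* provided[i] = original[perm[i]] */
   const uint8_t provided[4] = { 10, 30, 40, 20 }; /* original {10,20,30,40}, scanned */

   int reverse[4];
   for (int i = 0; i < 4; i++)
      reverse[perm[i]] = i;                        /* invert the permutation */

   uint8_t recovered[4];
   for (int i = 0; i < 4; i++)
      recovered[i] = provided[reverse[i]];         /* back to natural order */

   for (int i = 0; i < 4; i++)
      printf("%u ", recovered[i]);                 /* prints: 10 20 30 40 */
   printf("\n");
   return 0;
}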
diff --git a/src/gallium/state_trackers/va/picture_vp9.c b/src/gallium/state_trackers/va/picture_vp9.c
index c1ca54cd008..b5aca9a513c 100644
--- a/src/gallium/state_trackers/va/picture_vp9.c
+++ b/src/gallium/state_trackers/va/picture_vp9.c
@@ -28,6 +28,8 @@
#include "vl/vl_vlc.h"
#include "va_private.h"
+#define NUM_VP9_REFS 8
+
void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
{
VADecPictureParameterBufferVP9 *vp9 = buf->data;
@@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context,
context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth;
- for (i = 0 ; i < 8 ; i++)
+ for (i = 0 ; i < NUM_VP9_REFS ; i++)
vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]);
+
+ if (!context->decoder && !context->templat.max_references)
+ context->templat.max_references = NUM_VP9_REFS;
}
void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf)
diff --git a/src/gallium/state_trackers/xvmc/attributes.c b/src/gallium/state_trackers/xvmc/attributes.c
index 375705669b0..6e4d78a9a29 100644
--- a/src/gallium/state_trackers/xvmc/attributes.c
+++ b/src/gallium/state_trackers/xvmc/attributes.c
@@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
if (!attr)
return XvMCBadContext;
- if (strcmp(attr, XV_BRIGHTNESS))
+ if (strcmp(attr, XV_BRIGHTNESS) == 0)
context_priv->procamp.brightness = value / 1000.0f;
- else if (strcmp(attr, XV_CONTRAST))
+ else if (strcmp(attr, XV_CONTRAST) == 0)
context_priv->procamp.contrast = value / 1000.0f + 1.0f;
- else if (strcmp(attr, XV_SATURATION))
+ else if (strcmp(attr, XV_SATURATION) == 0)
context_priv->procamp.saturation = value / 1000.0f + 1.0f;
- else if (strcmp(attr, XV_HUE))
+ else if (strcmp(attr, XV_HUE) == 0)
context_priv->procamp.hue = value / 1000.0f;
- else if (strcmp(attr, XV_COLORSPACE))
+ else if (strcmp(attr, XV_COLORSPACE) == 0)
context_priv->color_standard = value ?
VL_CSC_COLOR_STANDARD_BT_601 :
VL_CSC_COLOR_STANDARD_BT_709;
@@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int
if (!attr)
return XvMCBadContext;
- if (strcmp(attr, XV_BRIGHTNESS))
+ if (strcmp(attr, XV_BRIGHTNESS) == 0)
*value = context_priv->procamp.brightness * 1000;
- else if (strcmp(attr, XV_CONTRAST))
+ else if (strcmp(attr, XV_CONTRAST) == 0)
*value = context_priv->procamp.contrast * 1000 - 1000;
- else if (strcmp(attr, XV_SATURATION))
+ else if (strcmp(attr, XV_SATURATION) == 0)
*value = context_priv->procamp.saturation * 1000 + 1000;
- else if (strcmp(attr, XV_HUE))
+ else if (strcmp(attr, XV_HUE) == 0)
*value = context_priv->procamp.hue * 1000;
- else if (strcmp(attr, XV_COLORSPACE))
+ else if (strcmp(attr, XV_COLORSPACE) == 0)
*value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709;
else
return BadName;
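All of the comparisons above (and in xvmc_bench.c below) were inverted because strcmp() returns 0 when its arguments are equal, so a bare if (strcmp(a, b)) takes the branch on mismatch. A two-case demonstration:

#include <stdio.h>
#include <string.h>

int main(void)
{
   const char *attr = "XV_BRIGHTNESS";
   if (strcmp(attr, "XV_BRIGHTNESS") == 0)
      printf("matched\n");       /* taken: the strings are equal */
   if (strcmp(attr, "XV_CONTRAST"))
      printf("not contrast\n");  /* taken: non-zero means different */
   return 0;
}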
diff --git a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
index 3cd23173c7c..dbd705639f6 100644
--- a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
+++ b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c
@@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config)
while (token && !fail)
{
- if (strcmp(token, "i"))
+ if (strcmp(token, "i") == 0)
config->mb_types |= MB_TYPE_I;
- else if (strcmp(token, "p"))
+ else if (strcmp(token, "p") == 0)
config->mb_types |= MB_TYPE_P;
- else if (strcmp(token, "b"))
+ else if (strcmp(token, "b") == 0)
config->mb_types |= MB_TYPE_B;
else
fail = 1;
diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build
index bc72b1110a0..b3c31c5dc6d 100644
--- a/src/gallium/targets/d3dadapter9/meson.build
+++ b/src/gallium/targets/d3dadapter9/meson.build
@@ -68,5 +68,5 @@ pkg.generate(
description : 'Native D3D driver modules',
version : '.'.join(nine_version),
requires_private : 'libdrm >= ' + dep_libdrm.version(),
- variables : ['moduledir=${prefix}/@0@'.format(d3d_drivers_path)],
+ variables : ['moduledir=@0@'.format(d3d_drivers_path)],
)
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index 9c43fa1e8fd..6134251b5ca 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -40,12 +40,23 @@ LOCAL_LDFLAGS := \
-Wl,--undefined-version
LOCAL_SHARED_LIBRARIES := \
- libbacktrace \
libdl \
libglapi \
- libexpat \
libz
+# If the Android version is >= 8, Mesa should link libexpat statically; otherwise link it dynamically
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+LOCAL_STATIC_LIBRARIES := \
+ libexpat
+else
+LOCAL_SHARED_LIBRARIES += \
+ libexpat
+endif
+
+ifeq ($(USE_LIBBACKTRACE),true)
+ LOCAL_SHARED_LIBRARIES += libbacktrace
+endif
+
$(foreach d, $(MESA_BUILD_GALLIUM), $(eval LOCAL_CFLAGS += $(patsubst HAVE_%,-D%,$(d))))
# sort GALLIUM_LIBS to remove any duplicates
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 68d226621b2..edd0c007e48 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -60,6 +60,10 @@ libgallium_dri = shared_library(
driver_tegra, driver_i915, driver_svga, driver_virgl,
driver_swr,
],
+ # Will be deleted during installation, see install_megadrivers.py
+ install : true,
+ install_dir : dri_drivers_path,
+ name_suffix : 'so',
)
foreach d : [[with_gallium_kmsro, 'pl111_dri.so'],
diff --git a/src/gallium/targets/omx/meson.build b/src/gallium/targets/omx/meson.build
index 6811e6ff904..7772ae47bb5 100644
--- a/src/gallium/targets/omx/meson.build
+++ b/src/gallium/targets/omx/meson.build
@@ -32,7 +32,7 @@ endif
libomx_gallium = shared_library(
'omx_mesa',
- 'target.c',
+ ['target.c', xmlpool_options_h],
c_args : c_vis_args,
cpp_args : cpp_vis_args,
link_args : [omx_link_args, ld_args_gc_sections],
diff --git a/src/gallium/targets/osmesa/meson.build b/src/gallium/targets/osmesa/meson.build
index b4ae8f4b6ec..e873e311aa0 100644
--- a/src/gallium/targets/osmesa/meson.build
+++ b/src/gallium/targets/osmesa/meson.build
@@ -43,9 +43,9 @@ libosmesa = shared_library(
inc_gallium_drivers,
],
link_depends : osmesa_link_deps,
- link_whole : [libosmesa_st],
+ link_whole : [libosmesa_st, libglapi_static],
link_with : [
- libmesa_gallium, libgallium, libglapi_static, libws_null, osmesa_link_with,
+ libmesa_gallium, libgallium, libws_null, osmesa_link_with,
],
dependencies : [
dep_selinux, dep_thread, dep_clock, dep_unwind,
diff --git a/src/gallium/targets/va/meson.build b/src/gallium/targets/va/meson.build
index ded689b464d..4bfb5cbab7a 100644
--- a/src/gallium/targets/va/meson.build
+++ b/src/gallium/targets/va/meson.build
@@ -33,7 +33,7 @@ endif
libva_gallium = shared_library(
'gallium_drv_video',
- 'target.c',
+ ['target.c', xmlpool_options_h],
c_args : c_vis_args,
cpp_args : cpp_vis_args,
link_args : [va_link_args, ld_args_gc_sections],
@@ -49,8 +49,10 @@ libva_gallium = shared_library(
dep_libdrm, dep_thread, driver_r600, driver_radeonsi, driver_nouveau,
],
link_depends : va_link_depends,
+ # Will be deleted during installation, see install_megadrivers.py
install : true,
install_dir : va_drivers_path,
+ name_suffix : 'so',
)
foreach d : [[with_gallium_r600, 'r600'],
diff --git a/src/gallium/targets/vdpau/meson.build b/src/gallium/targets/vdpau/meson.build
index 22e3f5ffdd8..48f01ffba6c 100644
--- a/src/gallium/targets/vdpau/meson.build
+++ b/src/gallium/targets/vdpau/meson.build
@@ -38,7 +38,7 @@ endif
libvdpau_gallium = shared_library(
'vdpau_gallium',
- 'target.c',
+ ['target.c', xmlpool_options_h],
c_args : c_vis_args,
cpp_args : cpp_vis_args,
link_args : [vdpau_link_args, ld_args_gc_sections],
@@ -55,6 +55,10 @@ libvdpau_gallium = shared_library(
],
link_depends : vdpau_link_depends,
soversion : '@0@.@1@.0'.format(VDPAU_MAJOR, VDPAU_MINOR),
+ # Will be deleted during installation, see install_megadrivers.py
+ install : true,
+ install_dir : vdpau_drivers_path,
+ name_suffix : 'so',
)
foreach d : [[with_gallium_r300, 'r300'],
[with_gallium_r600, 'r600'],
diff --git a/src/gallium/targets/xa/meson.build b/src/gallium/targets/xa/meson.build
index 733ef54ff85..582d5ef67f6 100644
--- a/src/gallium/targets/xa/meson.build
+++ b/src/gallium/targets/xa/meson.build
@@ -34,7 +34,7 @@ _xa_version = '.'.join(xa_version)
libxatracker = shared_library(
'xatracker',
- 'target.c',
+ ['target.c', xmlpool_options_h],
c_args : c_vis_args,
cpp_args : cpp_vis_args,
link_args : [xa_link_args, ld_args_gc_sections],
diff --git a/src/gallium/targets/xvmc/meson.build b/src/gallium/targets/xvmc/meson.build
index 0af5b6477ce..537275aab57 100644
--- a/src/gallium/targets/xvmc/meson.build
+++ b/src/gallium/targets/xvmc/meson.build
@@ -33,7 +33,7 @@ endif
libxvmc_gallium = shared_library(
'XvMCgallium',
- 'target.c',
+ ['target.c', xmlpool_options_h],
c_args : c_vis_args,
cpp_args : cpp_vis_args,
link_args : [xvmc_link_args, ld_args_gc_sections],
@@ -47,6 +47,10 @@ libxvmc_gallium = shared_library(
],
dependencies : [dep_thread, driver_r600, driver_nouveau],
link_depends : xvmc_link_depends,
+ # Will be deleted during installation, see install_megadrivers.py
+ install : true,
+ install_dir : xvmc_drivers_path,
+ name_suffix : 'so',
)
foreach d : [[with_gallium_r600, 'r600'], [with_gallium_nouveau, 'nouveau']]
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b4e62acbae4..2e595e5a1b0 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -386,7 +386,8 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE &&
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC &&
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC &&
- cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC;
+ cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC &&
+ cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG;
}
static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
@@ -1219,8 +1220,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs)
{
struct amdgpu_cs_context *cs = acs->csc;
- cs->num_fence_dependencies = 0;
-
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 79d2c1345ef..45e54b4791d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -92,6 +92,10 @@ static bool do_winsys_init(struct amdgpu_winsys *ws,
if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
goto fail;
+ /* TODO: Enable this once the kernel handles it efficiently. */
+ if (ws->info.has_dedicated_vram)
+ ws->info.has_local_buffers = false;
+
handle_env_var_force_family(ws);
ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
diff --git a/src/gallium/winsys/svga/drm/Makefile.sources b/src/gallium/winsys/svga/drm/Makefile.sources
index f82b0097b5b..191f0b88b4a 100644
--- a/src/gallium/winsys/svga/drm/Makefile.sources
+++ b/src/gallium/winsys/svga/drm/Makefile.sources
@@ -8,6 +8,8 @@ C_SOURCES := \
vmw_fence.c \
vmw_fence.h \
vmwgfx_drm.h \
+ vmw_msg.c \
+ vmw_msg.h \
vmw_screen.c \
vmw_screen_dri.c \
vmw_screen.h \
diff --git a/src/gallium/winsys/svga/drm/meson.build b/src/gallium/winsys/svga/drm/meson.build
index 24f67aca9ec..55266ce1623 100644
--- a/src/gallium/winsys/svga/drm/meson.build
+++ b/src/gallium/winsys/svga/drm/meson.build
@@ -23,6 +23,7 @@ files_svgadrm = files(
'vmw_buffer.c',
'vmw_context.c',
'vmw_fence.c',
+ 'vmw_msg.c',
'vmw_screen.c',
'vmw_screen_dri.c',
'vmw_screen_ioctl.c',
diff --git a/src/gallium/drivers/svga/svga_msg.c b/src/gallium/winsys/svga/drm/vmw_msg.c
old mode 100755
new mode 100644
similarity index 93%
rename from src/gallium/drivers/svga/svga_msg.c
rename to src/gallium/winsys/svga/drm/vmw_msg.c
index 8b63132cb57..8cce2241f36
--- a/src/gallium/drivers/svga/svga_msg.c
+++ b/src/gallium/winsys/svga/drm/vmw_msg.c
@@ -29,7 +29,8 @@
#include "util/u_memory.h"
#include "util/u_string.h"
#include "pipe/p_defines.h"
-#include "svga_msg.h"
+#include "svga_winsys.h"
+#include "vmw_msg.h"
#define MESSAGE_STATUS_SUCCESS 0x0001
@@ -83,7 +84,7 @@
port_num, magic, \
ax, bx, cx, dx, si, di) \
({ \
- __asm__ volatile ("inl %%dx, %%eax;" : \
+ __asm__ volatile ("inl %%dx, %%eax;" : \
"=a"(ax), \
"=b"(bx), \
"=c"(cx), \
@@ -128,7 +129,7 @@ typedef uint64_t VMW_REG;
port_num, magic, bp, \
ax, bx, cx, dx, si, di) \
({ \
- __asm__ volatile ("push %%rbp;" \
+ __asm__ volatile ("push %%rbp;" \
"movq %12, %%rbp;" \
"rep outsb;" \
"pop %%rbp;" : \
@@ -152,7 +153,7 @@ typedef uint64_t VMW_REG;
port_num, magic, bp, \
ax, bx, cx, dx, si, di) \
({ \
- __asm__ volatile ("push %%rbp;" \
+ __asm__ volatile ("push %%rbp;" \
"movq %12, %%rbp;" \
"rep insb;" \
"pop %%rbp" : \
@@ -183,7 +184,7 @@ typedef uint32_t VMW_REG;
port_num, magic, bp, \
ax, bx, cx, dx, si, di) \
({ \
- __asm__ volatile ("push %%ebp;" \
+ __asm__ volatile ("push %%ebp;" \
"mov %12, %%ebp;" \
"rep outsb;" \
"pop %%ebp;" : \
@@ -208,7 +209,7 @@ typedef uint32_t VMW_REG;
port_num, magic, bp, \
ax, bx, cx, dx, si, di) \
({ \
- __asm__ volatile ("push %%ebp;" \
+ __asm__ volatile ("push %%ebp;" \
"mov %12, %%ebp;" \
"rep insb;" \
"pop %%ebp" : \
@@ -252,7 +253,7 @@ typedef uint32_t VMW_REG;
(void) in_cx; (void) bp; \
(void) ax; (void) bx; (void) cx; \
(void) dx; (void) si; (void) di;
-
+
#define VMW_PORT_HB_IN(cmd, in_cx, in_si, in_di, \
port_num, magic, bp, \
@@ -283,7 +284,7 @@ struct rpc_channel {
/**
- * svga_open_channel
+ * vmw_open_channel
*
* @channel: RPC channel
* @protocol:
@@ -291,7 +292,7 @@ struct rpc_channel {
* Returns: PIPE_OK on success, PIPE_ERROR otherwise
*/
static enum pipe_error
-svga_open_channel(struct rpc_channel *channel, unsigned protocol)
+vmw_open_channel(struct rpc_channel *channel, unsigned protocol)
{
VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si = 0, di = 0;
@@ -321,7 +322,7 @@ svga_open_channel(struct rpc_channel *channel, unsigned protocol)
 * Returns: PIPE_OK on success, PIPE_ERROR otherwise
*/
static enum pipe_error
-svga_close_channel(struct rpc_channel *channel)
+vmw_close_channel(struct rpc_channel *channel)
{
VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di;
@@ -344,7 +345,7 @@ svga_close_channel(struct rpc_channel *channel)
/**
- * svga_send_msg: Sends a message to the host
+ * vmw_send_msg: Sends a message to the host
*
* @channel: RPC channel
 * @msg: NULL terminated string
@@ -352,7 +353,7 @@ svga_close_channel(struct rpc_channel *channel)
* Returns: PIPE_OK on success
*/
static enum pipe_error
-svga_send_msg(struct rpc_channel *channel, const char *msg)
+vmw_send_msg(struct rpc_channel *channel, const char *msg)
{
VMW_REG ax = 0, bx = 0, cx = 0, dx = 0, si, di, bp;
size_t msg_len = strlen(msg);
@@ -406,46 +407,42 @@ svga_send_msg(struct rpc_channel *channel, const char *msg)
/**
- * svga_host_log: Sends a log message to the host
+ * vmw_svga_winsys_host_log: Sends a log message to the host
*
* @log: NULL terminated string
*
- * Returns: PIPE_OK on success
*/
-enum pipe_error
-svga_host_log(const char *log)
+void
+vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log)
{
struct rpc_channel channel;
char *msg;
int msg_len;
- enum pipe_error ret = PIPE_OK;
#ifdef MSG_NOT_IMPLEMENTED
- return ret;
+ return;
#endif
if (!log)
- return ret;
+ return;
msg_len = strlen(log) + strlen("log ") + 1;
msg = CALLOC(1, msg_len);
if (msg == NULL) {
debug_printf("Cannot allocate memory for log message\n");
- return PIPE_ERROR_OUT_OF_MEMORY;
+ return;
}
util_sprintf(msg, "log %s", log);
- if (svga_open_channel(&channel, RPCI_PROTOCOL_NUM) ||
- svga_send_msg(&channel, msg) ||
- svga_close_channel(&channel)) {
+ if (vmw_open_channel(&channel, RPCI_PROTOCOL_NUM) ||
+ vmw_send_msg(&channel, msg) ||
+ vmw_close_channel(&channel)) {
debug_printf("Failed to send log\n");
-
- ret = PIPE_ERROR;
}
FREE(msg);
- return ret;
+ return;
}
diff --git a/src/gallium/drivers/svga/svga_msg.h b/src/gallium/winsys/svga/drm/vmw_msg.h
similarity index 89%
rename from src/gallium/drivers/svga/svga_msg.h
rename to src/gallium/winsys/svga/drm/vmw_msg.h
index 9132ba7e240..57057f23638 100644
--- a/src/gallium/drivers/svga/svga_msg.h
+++ b/src/gallium/winsys/svga/drm/vmw_msg.h
@@ -26,17 +26,16 @@
* Author:
* Sinclair Yeh
*/
-#ifndef _SVGA_MSG_H
-#define _SVGA_MSG_H
+#ifndef _VMW_MSG_H
+#define _VMW_MSG_H
/**
- * svga_host_log: Sends a log message to the host
+ * vmw_svga_winsys_host_log: Sends a log message to the host
*
* @log: NULL terminated string
*
- * Returns: PIPE_OK on success
*/
-enum pipe_error svga_host_log(const char *log);
+void vmw_svga_winsys_host_log(struct svga_winsys_screen *sws, const char *log);
#endif
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
index 0ec8c1abe11..581083f521a 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
@@ -210,6 +210,10 @@ vmw_ioctl_gb_surface_create(struct vmw_winsys_screen *vws,
SVGA3dMSQualityLevel qualityLevel,
struct vmw_region **p_region)
{
+ union {
+ union drm_vmw_gb_surface_create_ext_arg ext_arg;
+ union drm_vmw_gb_surface_create_arg arg;
+ } s_arg;
struct drm_vmw_gb_surface_create_rep *rep;
struct vmw_region *region = NULL;
int ret;
@@ -222,12 +226,11 @@ vmw_ioctl_gb_surface_create(struct vmw_winsys_screen *vws,
return SVGA3D_INVALID_ID;
}
- if (vws->ioctl.have_drm_2_15) {
- union drm_vmw_gb_surface_create_ext_arg s_arg;
- struct drm_vmw_gb_surface_create_ext_req *req = &s_arg.req;
- rep = &s_arg.rep;
+ memset(&s_arg, 0, sizeof(s_arg));
- memset(&s_arg, 0, sizeof(s_arg));
+ if (vws->ioctl.have_drm_2_15) {
+ struct drm_vmw_gb_surface_create_ext_req *req = &s_arg.ext_arg.req;
+ rep = &s_arg.ext_arg.rep;
req->version = drm_vmw_gb_surface_v1;
req->multisample_pattern = multisamplePattern;
@@ -264,17 +267,15 @@ vmw_ioctl_gb_surface_create(struct vmw_winsys_screen *vws,
buffer_handle : SVGA3D_INVALID_ID;
ret = drmCommandWriteRead(vws->ioctl.drm_fd,
- DRM_VMW_GB_SURFACE_CREATE_EXT, &s_arg,
- sizeof(s_arg));
+ DRM_VMW_GB_SURFACE_CREATE_EXT, &s_arg.ext_arg,
+ sizeof(s_arg.ext_arg));
if (ret)
goto out_fail_create;
} else {
- union drm_vmw_gb_surface_create_arg s_arg;
- struct drm_vmw_gb_surface_create_req *req = &s_arg.req;
- rep = &s_arg.rep;
+ struct drm_vmw_gb_surface_create_req *req = &s_arg.arg.req;
+ rep = &s_arg.arg.rep;
- memset(&s_arg, 0, sizeof(s_arg));
req->svga3d_flags = (uint32_t) flags;
req->format = (uint32_t) format;
@@ -305,7 +306,7 @@ vmw_ioctl_gb_surface_create(struct vmw_winsys_screen *vws,
buffer_handle : SVGA3D_INVALID_ID;
ret = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_GB_SURFACE_CREATE,
- &s_arg, sizeof(s_arg));
+ &s_arg.arg, sizeof(s_arg.arg));
if (ret)
goto out_fail_create;
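The surface-create refactor above exists because rep previously pointed into a union declared inside one branch of the if/else; once that block ended, the later dereference of rep read a dead stack slot. Hoisting the storage into a function-scope union of both argument types keeps the pointer valid. A minimal sketch of the hazard and the fix, with toy types standing in for the drm_vmw argument structs:

#include <stdio.h>
#include <string.h>

struct rep_v1 { int handle; };
struct rep_v2 { int handle; int extra; };

int main(void)
{
   union {
      struct rep_v1 v1;
      struct rep_v2 v2;
   } s_arg;                          /* lives for the whole function */
   int *handle;

   memset(&s_arg, 0, sizeof(s_arg));
   int have_v2 = 1;
   if (have_v2)
      handle = &s_arg.v2.handle;     /* safe: s_arg outlives the branch */
   else
      handle = &s_arg.v1.handle;

   *handle = 42;                     /* with a block-scoped s_arg this would
                                      * be a use of a dead stack slot */
   printf("%d\n", *handle);
   return 0;
}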
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_svga.c b/src/gallium/winsys/svga/drm/vmw_screen_svga.c
index a6990414e20..cd3f21f6033 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_svga.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_svga.c
@@ -48,6 +48,7 @@
#include "vmw_surface.h"
#include "vmw_buffer.h"
#include "vmw_fence.h"
+#include "vmw_msg.h"
#include "vmw_shader.h"
#include "vmw_query.h"
#include "svga3d_surfacedefs.h"
@@ -509,6 +510,8 @@ vmw_winsys_screen_init_svga(struct vmw_winsys_screen *vws)
vws->base.stats_time_push = vmw_svga_winsys_stats_time_push;
vws->base.stats_time_pop = vmw_svga_winsys_stats_time_pop;
+ vws->base.host_log = vmw_svga_winsys_host_log;
+
return TRUE;
}
diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
index 8753139107c..a4c1d50453b 100644
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -396,6 +396,7 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
{
struct xlib_displaytarget *xlib_dt;
unsigned nblocksy, size;
+ int ignore;
xlib_dt = CALLOC_STRUCT(xlib_displaytarget);
if (!xlib_dt)
@@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
xlib_dt->stride = align(util_format_get_stride(format, width), alignment);
size = xlib_dt->stride * nblocksy;
- if (!debug_get_option_xlib_no_shm()) {
+ if (!debug_get_option_xlib_no_shm() &&
+ XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) {
xlib_dt->data = alloc_shm(xlib_dt, size);
if (xlib_dt->data) {
xlib_dt->shm = True;
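The guard added above asks the X server whether MIT-SHM is actually present before trying shared-memory images, e.g. when the display is remote and shared segments cannot work. A standalone version of the same check (requires a reachable X display; link with -lX11):

#include <stdio.h>
#include <X11/Xlib.h>

int main(void)
{
   Display *dpy = XOpenDisplay(NULL);
   if (!dpy)
      return 1;

   int ignore;
   if (XQueryExtension(dpy, "MIT-SHM", &ignore, &ignore, &ignore))
      printf("MIT-SHM available, SHM path usable\n");
   else
      printf("no MIT-SHM, fall back to a plain XImage\n");

   XCloseDisplay(dpy);
   return 0;
}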
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index 26de8c702df..a2d232a539c 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -46,7 +46,7 @@
#define VIRGL_DRM_VERSION(major, minor) ((major) << 16 | (minor))
-#define VIRGL_DRM_VERSION_FENCE_FD VIRGL_DRM_VERSION(1, 0)
+#define VIRGL_DRM_VERSION_FENCE_FD VIRGL_DRM_VERSION(0, 1)
static inline boolean can_cache_resource(struct virgl_hw_res *res)
@@ -870,7 +870,7 @@ static int virgl_drm_get_version(int fd)
else if (version->version_major != 0)
ret = -EINVAL;
else
- ret = version->version_minor;
+ ret = VIRGL_DRM_VERSION(0, version->version_minor);
drmFreeVersion(version);
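The two hunks above make both sides of the fence-FD version comparison use the same (major << 16 | minor) encoding; previously a raw minor number was compared against an encoded constant. A compact sketch:

#include <stdio.h>

/* Both the constant and the queried version must go through the same
 * encoding, or a raw minor gets compared against an encoded value. */
#define DRM_VERSION(major, minor) ((major) << 16 | (minor))
#define VERSION_FENCE_FD DRM_VERSION(0, 1)

int main(void)
{
   int minor = 1;                        /* as reported by the kernel */
   int encoded = DRM_VERSION(0, minor);  /* what the fixed code returns */
   printf("fence fd supported: %d\n", encoded >= VERSION_FENCE_FD);
   return 0;
}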
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index 298adc80ef1..d53fc87e21e 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -642,7 +642,6 @@ dri3_set_swap_interval(__GLXDRIdrawable *pdraw, int interval)
break;
}
- priv->swap_interval = interval;
loader_dri3_set_swap_interval(&priv->loader_drawable, interval);
return 0;
@@ -659,7 +658,7 @@ dri3_get_swap_interval(__GLXDRIdrawable *pdraw)
struct dri3_drawable *priv = (struct dri3_drawable *) pdraw;
- return priv->swap_interval;
+ return priv->loader_drawable.swap_interval;
}
static void
diff --git a/src/glx/dri3_priv.h b/src/glx/dri3_priv.h
index 1d3c03f9997..32a8d3f7e7d 100644
--- a/src/glx/dri3_priv.h
+++ b/src/glx/dri3_priv.h
@@ -117,7 +117,6 @@ struct dri3_context
struct dri3_drawable {
__GLXDRIdrawable base;
struct loader_dri3_drawable loader_drawable;
- int swap_interval;
/* LIBGL_SHOW_FPS support */
uint64_t previous_ust;
diff --git a/src/glx/drisw_glx.c b/src/glx/drisw_glx.c
index 00c7fa100ab..48c03ca42e0 100644
--- a/src/glx/drisw_glx.c
+++ b/src/glx/drisw_glx.c
@@ -147,6 +147,9 @@ XDestroyDrawable(struct drisw_drawable * pdp, Display * dpy, XID drawable)
if (pdp->ximage)
XDestroyImage(pdp->ximage);
+ if (pdp->shminfo.shmid > 0)
+ XShmDetach(dpy, &pdp->shminfo);
+
free(pdp->visinfo);
XFreeGC(dpy, pdp->gc);
diff --git a/src/intel/Android.common.mk b/src/intel/Android.common.mk
index 12cea6e5472..79d9f1284a0 100644
--- a/src/intel/Android.common.mk
+++ b/src/intel/Android.common.mk
@@ -38,7 +38,17 @@ LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/mapi \
$(MESA_TOP)/src/mesa
-LOCAL_SHARED_LIBRARIES := libexpat libz
+LOCAL_SHARED_LIBRARIES := libz liblog
+
+# If the Android version is >= 8, Mesa should link libexpat statically; otherwise link it dynamically
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+LOCAL_STATIC_LIBRARIES := \
+ libexpat
+else
+LOCAL_SHARED_LIBRARIES += \
+ libexpat
+endif
+
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml
diff --git a/src/intel/Android.compiler.mk b/src/intel/Android.compiler.mk
index c2b01221dfc..41af7b20b9c 100644
--- a/src/intel/Android.compiler.mk
+++ b/src/intel/Android.compiler.mk
@@ -28,7 +28,7 @@
# ---------------------------------------
include $(CLEAR_VARS)
-
+LOCAL_CFLAGS += -Wno-error
LOCAL_MODULE := libmesa_intel_compiler
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
diff --git a/src/intel/Android.dev.mk b/src/intel/Android.dev.mk
index cd2ed66a176..3011ee232ed 100644
--- a/src/intel/Android.dev.mk
+++ b/src/intel/Android.dev.mk
@@ -33,5 +33,8 @@ LOCAL_C_INCLUDES := $(MESA_TOP)/include/drm-uapi
LOCAL_SRC_FILES := $(DEV_FILES)
+LOCAL_CFLAGS := \
+ -Wno-gnu-variable-sized-type-not-at-end
+
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
diff --git a/src/intel/Android.vulkan.mk b/src/intel/Android.vulkan.mk
index 7019c8cbc8f..73586803552 100644
--- a/src/intel/Android.vulkan.mk
+++ b/src/intel/Android.vulkan.mk
@@ -23,9 +23,10 @@ LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
include $(LOCAL_PATH)/Makefile.sources
-VK_ENTRYPOINTS_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py
-
-VK_EXTENSIONS_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/vulkan/anv_extensions_gen.py
+ANV_ENTRYPOINTS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_entrypoints_gen.py
+ANV_EXTENSIONS_GEN_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions_gen.py
+ANV_EXTENSIONS_SCRIPT := $(LOCAL_PATH)/vulkan/anv_extensions.py
+VULKAN_API_XML := $(MESA_TOP)/src/vulkan/registry/vk.xml
VULKAN_COMMON_INCLUDES := \
$(MESA_TOP)/include \
@@ -41,6 +42,18 @@ VULKAN_COMMON_INCLUDES := \
$(MESA_TOP)/src/compiler \
frameworks/native/vulkan/include
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+VULKAN_COMMON_INCLUDES += \
+ frameworks/native/vulkan/include \
+ frameworks/native/libs/nativebase/include \
+ frameworks/native/libs/nativewindow/include \
+ frameworks/native/libs/arect/include
+
+VULKAN_COMMON_HEADER_LIBRARIES := \
+ libcutils_headers \
+ libhardware_headers
+endif
+
# libmesa_anv_entrypoints with header and dummy.c
#
# This static library is built to pull entrypoints header
@@ -59,16 +72,28 @@ LOCAL_C_INCLUDES := \
LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_entrypoints.h
LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/dummy.c
+LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.h
$(intermediates)/vulkan/dummy.c:
@mkdir -p $(dir $@)
@echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))"
$(hide) touch $@
-$(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c
- $(VK_ENTRYPOINTS_SCRIPT) \
+$(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c \
+ $(ANV_ENTRYPOINTS_GEN_SCRIPT) \
+ $(ANV_EXTENSIONS_SCRIPT) \
+ $(VULKAN_API_XML)
+ $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \
--outdir $(dir $@) \
- --xml $(MESA_TOP)/src/vulkan/registry/vk.xml
+ --xml $(VULKAN_API_XML)
+
+$(intermediates)/vulkan/anv_extensions.h: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \
+ $(ANV_EXTENSIONS_SCRIPT) \
+ $(VULKAN_API_XML)
+ @mkdir -p $(dir $@)
+ $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \
+ --xml $(VULKAN_API_XML) \
+ --out-h $@
LOCAL_EXPORT_C_INCLUDE_DIRS := \
$(intermediates)
@@ -107,6 +132,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -127,6 +153,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -147,6 +174,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -167,6 +195,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -187,6 +216,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -207,6 +237,7 @@ LOCAL_C_INCLUDES := $(ANV_INCLUDES)
LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -218,7 +249,7 @@ include $(BUILD_STATIC_LIBRARY)
include $(CLEAR_VARS)
LOCAL_MODULE := libmesa_vulkan_common
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
-
+LOCAL_CFLAGS += -Wno-error
intermediates := $(call local-generated-sources-dir)
LOCAL_SRC_FILES := $(VULKAN_FILES)
@@ -240,27 +271,25 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_entrypoints.c
LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.c
-LOCAL_GENERATED_SOURCES += $(intermediates)/vulkan/anv_extensions.h
-$(intermediates)/vulkan/anv_entrypoints.c:
+$(intermediates)/vulkan/anv_entrypoints.c: $(ANV_ENTRYPOINTS_GEN_SCRIPT) \
+ $(ANV_EXTENSIONS_SCRIPT) \
+ $(VULKAN_API_XML)
@mkdir -p $(dir $@)
- $(VK_ENTRYPOINTS_SCRIPT) \
- --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \
+ $(MESA_PYTHON2) $(ANV_ENTRYPOINTS_GEN_SCRIPT) \
+ --xml $(VULKAN_API_XML) \
--outdir $(dir $@)
-$(intermediates)/vulkan/anv_extensions.c:
+$(intermediates)/vulkan/anv_extensions.c: $(ANV_EXTENSIONS_GEN_SCRIPT) \
+ $(ANV_EXTENSIONS_SCRIPT) \
+ $(VULKAN_API_XML)
@mkdir -p $(dir $@)
- $(VK_EXTENSIONS_SCRIPT) \
- --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \
+ $(MESA_PYTHON2) $(ANV_EXTENSIONS_GEN_SCRIPT) \
+ --xml $(VULKAN_API_XML) \
--out-c $@
-$(intermediates)/vulkan/anv_extensions.h:
- @mkdir -p $(dir $@)
- $(VK_EXTENSIONS_SCRIPT) \
- --xml $(MESA_TOP)/src/vulkan/registry/vk.xml \
- --out-h $@
-
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES)
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
include $(MESA_COMMON_MK)
include $(BUILD_STATIC_LIBRARY)
@@ -310,6 +339,16 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libmesa_anv_entrypoints
LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog
+LOCAL_HEADER_LIBRARIES += $(VULKAN_COMMON_HEADER_LIBRARIES)
+
+# On Android 8.1 (PLATFORM_SDK_VERSION >= 27) and newer, Mesa should link
+# libexpat statically; on older versions, link it dynamically.
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+LOCAL_STATIC_LIBRARIES := \
+ libexpat
+else
+ LOCAL_SHARED_LIBRARIES += \
+ libexpat
+endif
include $(MESA_COMMON_MK)
include $(BUILD_SHARED_LIBRARY)
diff --git a/src/intel/Makefile.isl.am b/src/intel/Makefile.isl.am
index a6733f3ba8e..dcb9d3ad6fc 100644
--- a/src/intel/Makefile.isl.am
+++ b/src/intel/Makefile.isl.am
@@ -33,12 +33,15 @@ ISL_GEN_LIBS = \
noinst_LTLIBRARIES += $(ISL_GEN_LIBS) \
isl/libisl.la \
- libisl_tiled_memcpy.la \
- libisl_tiled_memcpy_sse41.la
+ libisl_tiled_memcpy.la
isl_libisl_la_LIBADD = $(ISL_GEN_LIBS) \
- libisl_tiled_memcpy.la \
- libisl_tiled_memcpy_sse41.la
+ libisl_tiled_memcpy.la
+
+if SSE41_SUPPORTED
+isl_libisl_la_LIBADD += libisl_tiled_memcpy_sse41.la
+noinst_LTLIBRARIES += libisl_tiled_memcpy_sse41.la
+endif
isl_libisl_la_SOURCES = $(ISL_FILES) $(ISL_GENERATED_FILES)
diff --git a/src/intel/Makefile.vulkan.am b/src/intel/Makefile.vulkan.am
index b315f10a01a..cad0a57bc7f 100644
--- a/src/intel/Makefile.vulkan.am
+++ b/src/intel/Makefile.vulkan.am
@@ -253,6 +253,7 @@ VULKAN_TESTS = \
vulkan/tests/block_pool_no_free \
vulkan/tests/state_pool_no_free \
vulkan/tests/state_pool_free_list_only \
+ vulkan/tests/state_pool_padding \
vulkan/tests/state_pool
VULKAN_TEST_LDADD = \
@@ -274,6 +275,10 @@ vulkan_tests_state_pool_free_list_only_CFLAGS = $(VULKAN_CFLAGS)
vulkan_tests_state_pool_free_list_only_CPPFLAGS = $(VULKAN_CPPFLAGS)
vulkan_tests_state_pool_free_list_only_LDADD = $(VULKAN_TEST_LDADD)
+vulkan_tests_state_pool_padding_CFLAGS = $(VULKAN_CFLAGS)
+vulkan_tests_state_pool_padding_CPPFLAGS = $(VULKAN_CPPFLAGS)
+vulkan_tests_state_pool_padding_LDADD = $(VULKAN_TEST_LDADD)
+
vulkan_tests_state_pool_CFLAGS = $(VULKAN_CFLAGS)
vulkan_tests_state_pool_CPPFLAGS = $(VULKAN_CPPFLAGS)
vulkan_tests_state_pool_LDADD = $(VULKAN_TEST_LDADD)
diff --git a/src/intel/blorp/meson.build b/src/intel/blorp/meson.build
index c1201b0aa16..ff68d255164 100644
--- a/src/intel/blorp/meson.build
+++ b/src/intel/blorp/meson.build
@@ -33,5 +33,5 @@ libblorp = static_library(
files_libblorp,
include_directories : [inc_common, inc_intel],
c_args : [c_vis_args, no_override_init_args],
- dependencies : idep_nir_headers,
+ dependencies : [idep_nir_headers, idep_genxml],
)
diff --git a/src/intel/common/gen_debug.c b/src/intel/common/gen_debug.c
index a978f2f5818..8990d208207 100644
--- a/src/intel/common/gen_debug.c
+++ b/src/intel/common/gen_debug.c
@@ -85,6 +85,7 @@ static const struct debug_control debug_control[] = {
{ "nohiz", DEBUG_NO_HIZ },
{ "color", DEBUG_COLOR },
{ "reemit", DEBUG_REEMIT },
+ { "heur32", DEBUG_HEUR32 },
{ NULL, 0 }
};
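
With the table entry in place, the flag can be toggled from the environment
(e.g. INTEL_DEBUG=heur32,do32) and tested like any other debug bit. A sketch
of the consumer side:

    /* Sketch: the new bit is read the same way as the existing ones. */
    if (unlikely(INTEL_DEBUG & DEBUG_HEUR32)) {
       /* consider SIMD32 dispatch, gated by the heuristics below */
    }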
diff --git a/src/intel/common/gen_debug.h b/src/intel/common/gen_debug.h
index 72d7ca20a39..c2ca2e2ebd6 100644
--- a/src/intel/common/gen_debug.h
+++ b/src/intel/common/gen_debug.h
@@ -83,6 +83,7 @@ extern uint64_t INTEL_DEBUG;
#define DEBUG_NO_HIZ (1ull << 39)
#define DEBUG_COLOR (1ull << 40)
#define DEBUG_REEMIT (1ull << 41)
+#define DEBUG_HEUR32 (1ull << 42)
/* These flags are not compatible with the disk shader cache */
#define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME
@@ -90,7 +91,7 @@ extern uint64_t INTEL_DEBUG;
/* These flags may affect program generation */
#define DEBUG_DISK_CACHE_MASK \
(DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \
- DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32)
+ DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_HEUR32)
#ifdef HAVE_ANDROID_PLATFORM
#define LOG_TAG "INTEL-MESA"
diff --git a/src/intel/common/meson.build b/src/intel/common/meson.build
index 332e978b0ad..ec45962502e 100644
--- a/src/intel/common/meson.build
+++ b/src/intel/common/meson.build
@@ -43,5 +43,5 @@ libintel_common = static_library(
include_directories : [inc_common, inc_intel],
c_args : [c_vis_args, no_override_init_args],
link_with : [libisl],
- dependencies : [dep_expat, dep_libdrm, dep_thread],
+ dependencies : [dep_expat, dep_libdrm, dep_thread, idep_genxml],
)
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 61a4528d372..c294e5c3222 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -38,6 +38,15 @@ struct ra_regs;
struct nir_shader;
struct brw_program;
+struct brw_simd32_heuristics_control {
+ bool grouped_sends_check;
+ int max_grouped_sends;
+ bool inst_count_check;
+ float inst_count_ratio;
+ bool mrt_check;
+ int max_mrts;
+};
+
struct brw_compiler {
const struct gen_device_info *devinfo;
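
For context, the new knobs hang off the compiler object. A hypothetical
initialization at compiler-creation time (the values are illustrative; this
patch does not set defaults in these hunks):

    /* Hypothetical tuning of the SIMD32 heuristics. */
    struct brw_simd32_heuristics_control *ctrl =
       &compiler->simd32_heuristics_control;
    ctrl->grouped_sends_check = true;  /* reject long runs of sampler sends */
    ctrl->max_grouped_sends = 4;
    ctrl->inst_count_check = true;     /* compare SIMD32 vs. SIMD16 code size */
    ctrl->inst_count_ratio = 1.5f;
    ctrl->mrt_check = true;            /* skip SIMD32 with many render targets */
    ctrl->max_mrts = 2;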
@@ -118,6 +127,8 @@ struct brw_compiler {
* whether nir_opt_large_constants will be run.
*/
bool supports_shader_constants;
+
+ struct brw_simd32_heuristics_control simd32_heuristics_control;
};
/**
@@ -196,6 +207,9 @@ struct brw_sampler_prog_key_data {
uint32_t yx_xuxv_image_mask;
uint32_t xy_uxvx_image_mask;
uint32_t ayuv_image_mask;
+
+ /* Scale factor for each texture. */
+ float scale_factors[32];
};
/**
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 9be82d1b87c..a53ace32716 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -696,9 +696,9 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
gen7_convert_mrf_to_grf(p, &dest);
assert(dest.nr < 128);
- assert(src0.file != BRW_IMMEDIATE_VALUE || src0.nr < 128);
- assert(src1.file != BRW_IMMEDIATE_VALUE || src1.nr < 128);
- assert(src2.file != BRW_IMMEDIATE_VALUE || src2.nr < 128);
+ assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
+ assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
+ assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
assert(dest.address_mode == BRW_ADDRESS_DIRECT);
assert(src0.address_mode == BRW_ADDRESS_DIRECT);
assert(src1.address_mode == BRW_ADDRESS_DIRECT);
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 8dd3b94fbd5..5b29292d6a0 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -251,6 +251,62 @@ fs_inst::is_send_from_grf() const
}
}
+bool
+fs_inst::is_control_source(unsigned arg) const
+{
+ switch (opcode) {
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+ return arg == 0;
+
+ case SHADER_OPCODE_BROADCAST:
+ case SHADER_OPCODE_SHUFFLE:
+ case SHADER_OPCODE_QUAD_SWIZZLE:
+ case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+ case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ case SHADER_OPCODE_IMAGE_SIZE:
+ case SHADER_OPCODE_GET_BUFFER_SIZE:
+ return arg == 1;
+
+ case SHADER_OPCODE_MOV_INDIRECT:
+ case SHADER_OPCODE_CLUSTER_BROADCAST:
+ case SHADER_OPCODE_TEX:
+ case FS_OPCODE_TXB:
+ case SHADER_OPCODE_TXD:
+ case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_LZ:
+ case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
+ case SHADER_OPCODE_TXF_UMS:
+ case SHADER_OPCODE_TXF_MCS:
+ case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXL_LZ:
+ case SHADER_OPCODE_TXS:
+ case SHADER_OPCODE_LOD:
+ case SHADER_OPCODE_TG4:
+ case SHADER_OPCODE_TG4_OFFSET:
+ case SHADER_OPCODE_SAMPLEINFO:
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+ case SHADER_OPCODE_BYTE_SCATTERED_READ:
+ case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
+ case SHADER_OPCODE_TYPED_ATOMIC:
+ case SHADER_OPCODE_TYPED_SURFACE_READ:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+ return arg == 1 || arg == 2;
+
+ case SHADER_OPCODE_SEND:
+ return arg == 0 || arg == 1;
+
+ default:
+ return false;
+ }
+}
+
/**
* Returns true if this instruction's sources and destinations cannot
* safely be the same register.
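
The intent of the classification, illustrated on a sampler message (a sketch
of the reasoning, not code from the patch):

    /* For a texture instruction such as
     *    tex(16) dst:F  coord:F  surface:UD  sampler:UD
     * sources 1 and 2 (surface and sampler index) are "control" operands:
     * scalar descriptor data rather than per-channel values. Flagging them
     * via is_control_source() keeps them from widening the execution type
     * or triggering regioning lowering in the passes changed below.
     */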
@@ -3061,6 +3117,7 @@ fs_visitor::opt_peephole_csel()
if (csel_inst != NULL) {
progress = true;
+ csel_inst->saturate = inst->saturate;
inst->remove(block);
}
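
The one-line fix matters because the peephole deletes the instruction it
replaces; a sketch of the failure it closes:

    /* Before: the replaced instruction carried a saturate modifier, e.g.
     *    sel.sat(8) dst, a, b
     * The fused CSEL must inherit .sat, otherwise values outside
     * [0.0, 1.0] would reach dst.
     */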
@@ -3899,18 +3956,22 @@ fs_visitor::lower_integer_multiplication()
bool needs_mov = false;
fs_reg orig_dst = inst->dst;
+
+ /* Get a new VGRF for the "low" 32x16-bit multiplication result if
+ * reusing the original destination is impossible due to hardware
+ * restrictions, source/destination overlap, or it being the null
+ * register.
+ */
fs_reg low = inst->dst;
if (orig_dst.is_null() || orig_dst.file == MRF ||
regions_overlap(inst->dst, inst->size_written,
inst->src[0], inst->size_read(0)) ||
regions_overlap(inst->dst, inst->size_written,
- inst->src[1], inst->size_read(1))) {
+ inst->src[1], inst->size_read(1)) ||
+ inst->dst.stride >= 4) {
needs_mov = true;
- /* Get a new VGRF but keep the same stride as inst->dst */
low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
inst->dst.type);
- low.stride = inst->dst.stride;
- low.offset = inst->dst.offset % REG_SIZE;
}
/* Get a new VGRF but keep the same stride as inst->dst */
@@ -7542,6 +7603,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
char **error_str)
{
const struct gen_device_info *devinfo = compiler->devinfo;
+ bool simd16_failed = false;
+ bool simd16_spilled = false;
shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
brw_nir_lower_fs_inputs(shader, devinfo, key);
@@ -7608,10 +7671,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
shader_time_index16);
v16.import_uniforms(&v8);
if (!v16.run_fs(allow_spilling, use_rep_send)) {
+ simd16_failed = true;
compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
v16.fail_msg);
} else {
+ simd16_spilled = v16.spilled_any_registers;
simd16_cfg = v16.cfg;
prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
@@ -7619,9 +7684,17 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
}
/* Currently, the compiler only supports SIMD32 on SNB+ */
+ const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control;
+ uint64_t mrts = shader->info.outputs_written >> FRAG_RESULT_DATA0;
+
if (v8.max_dispatch_width >= 32 && !use_rep_send &&
compiler->devinfo->gen >= 6 &&
- unlikely(INTEL_DEBUG & DEBUG_DO32)) {
+ (unlikely(INTEL_DEBUG & DEBUG_DO32) ||
+ (unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
+ !simd16_failed && !simd16_spilled &&
+ (!ctrl->mrt_check ||
+ (ctrl->mrt_check &&
+ u_count_bits64(&mrts) <= ctrl->max_mrts))))) {
/* Try a SIMD32 compile */
fs_visitor v32(compiler, log_data, mem_ctx, key,
&prog_data->base, prog, shader, 32,
@@ -7632,9 +7705,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
"SIMD32 shader failed to compile: %s",
v32.fail_msg);
} else {
- simd32_cfg = v32.cfg;
- prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
- prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
+ if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) ||
+ v32.run_heuristic(ctrl)) {
+ simd32_cfg = v32.cfg;
+ prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
+ prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
+ }
}
}
@@ -7713,13 +7789,49 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
}
if (simd32_cfg) {
- prog_data->dispatch_32 = true;
- prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32);
+ uint32_t offset = g.generate_code(simd32_cfg, 32);
+
+ if (unlikely(INTEL_DEBUG & DEBUG_DO32) ||
+ (unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
+ (!simd16_cfg ||
+ (simd16_cfg &&
+ (!ctrl->inst_count_check ||
+ (ctrl->inst_count_check &&
+ (float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) {
+ prog_data->dispatch_32 = true;
+ prog_data->prog_offset_32 = offset;
+ }
}
return g.get_assembly();
}
+bool
+fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl)
+{
+ int grouped_sends = 0;
+ int max_grouped_sends = 0;
+ bool pass = true;
+
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (inst->opcode >= SHADER_OPCODE_TEX && inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL) {
+ ++grouped_sends;
+ } else if (grouped_sends > 0) {
+ if (grouped_sends > max_grouped_sends) {
+ max_grouped_sends = grouped_sends;
+ }
+ grouped_sends = 0;
+ }
+ }
+
+ if (ctrl->grouped_sends_check) {
+ if (max_grouped_sends > ctrl->max_grouped_sends) {
+ pass = false;
+ }
+ }
+
+ return pass;
+}
+
fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
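
Taken together, the HEUR32 path only ships a SIMD32 binary when it clears the
configured checks. A worked sketch of the two numeric gates (the values are
illustrative, not defaults from this patch):

    /* Hypothetical settings: inst_count_ratio = 1.5, max_grouped_sends = 4.
     *
     * Size-ratio gate (after code generation):
     *   SIMD16 = 200 insts, SIMD32 = 280 -> 280/200 = 1.4 <= 1.5 -> keep
     *   SIMD16 = 200 insts, SIMD32 = 320 -> 320/200 = 1.6 >  1.5 -> drop
     *
     * Grouped-sends gate (run_heuristic):
     *   tex tex tex tex tex tex ... -> longest run 6 > 4  -> reject SIMD32
     *   tex tex mov tex tex ...     -> longest run 2 <= 4 -> accept
     */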
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 5361b768003..72acf85581e 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -289,6 +289,8 @@ class fs_visitor : public backend_shader
void dump_instruction(backend_instruction *inst);
void dump_instruction(backend_instruction *inst, FILE *file);
+ bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl);
+
const void *const key;
const struct brw_sampler_prog_key_data *key_tex;
@@ -400,6 +402,7 @@ class fs_generator
void enable_debug(const char *shader_name);
int generate_code(const cfg_t *cfg, int dispatch_width);
+ int get_inst_count(int dispatch_width);
const unsigned *get_assembly();
private:
@@ -495,6 +498,7 @@ class fs_generator
struct brw_stage_prog_data * const prog_data;
unsigned dispatch_width; /**< 8, 16 or 32 */
+ int inst_count[3]; /* for 8, 16 and 32 */
exec_list discard_halt_patches;
unsigned promoted_constants;
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 5fb522f810f..b58730fbbe5 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -255,6 +255,13 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
if (inst->opcode == BRW_OPCODE_AND)
break;
+ /* Not safe to use inequality operators if the types are different
+ */
+ if (scan_inst->dst.type != inst->src[0].type &&
+ inst->conditional_mod != BRW_CONDITIONAL_Z &&
+ inst->conditional_mod != BRW_CONDITIONAL_NZ)
+ break;
+
/* Comparisons operate differently for ints and floats */
if (scan_inst->dst.type != inst->dst.type &&
(scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index e3b68fa3165..82c2713a77f 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -90,9 +90,16 @@ brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
* different execution size when the number of components
* written to each destination GRF is not the same.
*/
- const unsigned width = MIN2(reg_width, phys_width);
- brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
- brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
+ if (reg->stride > 4) {
+ assert(reg != &inst->dst);
+ assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
+ brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
+ brw_reg = stride(brw_reg, reg->stride, 1, 0);
+ } else {
+ const unsigned width = MIN2(reg_width, phys_width);
+ brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
+ brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
+ }
if (devinfo->gen == 7 && !devinfo->is_haswell) {
/* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
@@ -2093,6 +2100,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
break;
case SHADER_OPCODE_INTERLOCK:
+ assert(devinfo->gen >= 9);
/* The interlock is basically a memory fence issued via sendc */
brw_memory_fence(p, dst, BRW_OPCODE_SENDC);
break;
@@ -2289,6 +2297,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
fill_count, promoted_constants, before_size,
after_size);
+ inst_count[ffs(dispatch_width) - 4] = before_size / 16;
+
return start_offset;
}
@@ -2297,3 +2307,13 @@ fs_generator::get_assembly()
{
return brw_get_program(p, &prog_data->program_size);
}
+
+int
+fs_generator::get_inst_count(int dispatch_width)
+{
+ if (dispatch_width == 8 || dispatch_width == 16 || dispatch_width == 32) {
+ return inst_count[ffs(dispatch_width) - 4];
+ } else {
+ return 0;
+ }
+}
\ No newline at end of file
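
The index math works because the three dispatch widths are consecutive powers
of two; a sketch, assuming the fixed 16-byte size of an uncompacted Gen EU
instruction:

    /* ffs(8) == 4, ffs(16) == 5, ffs(32) == 6, so:
     *   inst_count[ffs(8)  - 4] == inst_count[0]
     *   inst_count[ffs(16) - 4] == inst_count[1]
     *   inst_count[ffs(32) - 4] == inst_count[2]
     * before_size / 16 turns the pre-compaction code size in bytes into an
     * instruction count for the SIMD32 size-ratio heuristic.
     */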
diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index df50993dee6..c60d4700419 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -71,15 +71,33 @@ namespace {
!is_byte_raw_mov(inst)) {
return get_exec_type_size(inst);
} else {
- unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
+ /* Calculate the maximum byte stride and the minimum/maximum type
+ * size across all source and destination operands we are required to
+ * lower.
+ */
+ unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
+ unsigned min_size = type_sz(inst->dst.type);
+ unsigned max_size = type_sz(inst->dst.type);
for (unsigned i = 0; i < inst->sources; i++) {
- if (!is_uniform(inst->src[i]))
- stride = MAX2(stride, inst->src[i].stride *
- type_sz(inst->src[i].type));
+ if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
+ const unsigned size = type_sz(inst->src[i].type);
+ max_stride = MAX2(max_stride, inst->src[i].stride * size);
+ min_size = MIN2(min_size, size);
+ max_size = MAX2(max_size, size);
+ }
}
- return stride;
+ /* All operands involved in lowering need to fit in the calculated
+ * stride.
+ */
+ assert(max_size <= 4 * min_size);
+
+ /* Attempt to use the largest byte stride among all present operands,
+ * but never exceed a stride of 4 since that would lead to illegal
+ * destination regions during lowering.
+ */
+ return MIN2(max_stride, 4 * min_size);
}
}
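
A worked case for the new stride calculation (operand types chosen for
illustration):

    /* mov(8) dst:W<2>  src:D<1>
     *   dst byte stride = 2 * type_sz(W) = 4
     *   src byte stride = 1 * type_sz(D) = 4  -> max_stride = 4
     *   min_size = 2 (W), max_size = 4 (D)    -> assert(4 <= 4 * 2) holds
     *   return MIN2(4, 4 * 2) = 4, a legal byte stride for the lowered
     *   destination instead of one inherited blindly from inst->dst.
     */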
@@ -92,7 +110,7 @@ namespace {
required_dst_byte_offset(const fs_inst *inst)
{
for (unsigned i = 0; i < inst->sources; i++) {
- if (!is_uniform(inst->src[i]))
+ if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
if (reg_offset(inst->src[i]) % REG_SIZE !=
reg_offset(inst->dst) % REG_SIZE)
return 0;
@@ -109,7 +127,7 @@ namespace {
has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
unsigned i)
{
- if (is_unordered(inst)) {
+ if (is_unordered(inst) || inst->is_control_source(i)) {
return false;
} else {
const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f16627b8a64..6f0d9731cfe 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -512,6 +512,15 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
return false;
+ /* If either opcode has source modifiers, bail.
+ *
+ * TODO: We can potentially handle source modifiers if both of the opcodes
+ * we're combining are signed integers.
+ */
+ if (instr->src[0].abs || instr->src[0].negate ||
+ src0->src[0].abs || src0->src[0].negate)
+ return false;
+
unsigned element = nir_src_as_uint(src0->src[1].src);
/* Element type to extract.*/
@@ -1484,16 +1493,25 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
* Use two instructions and a word or DWord intermediate integer type.
*/
if (nir_dest_bit_size(instr->dest.dest) == 64) {
- const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8);
+ const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
if (instr->op == nir_op_extract_i8) {
/* If we need to sign extend, extract to a word first */
fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
bld.MOV(w_temp, subscript(op[0], type, byte));
bld.MOV(result, w_temp);
+ } else if (byte & 1) {
+ /* Extract the high byte from the word containing the desired byte
+ * offset.
+ */
+ bld.SHR(result,
+ subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
+ brw_imm_uw(8));
} else {
/* Otherwise use an AND with 0xff and a word type */
- bld.AND(result, subscript(op[0], type, byte / 2), brw_imm_uw(0xff));
+ bld.AND(result,
+ subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
+ brw_imm_uw(0xff));
}
} else {
const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
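
A worked case for the new odd-byte path (sketch):

    /* extract_u8(src, 3) with a 64-bit destination:
     * byte = 3 is odd, so read word 3/2 = 1 (bytes 2-3 of src) and shift:
     *    shr(N) result, src.uw[1], 8    -> byte 3, zero-extended
     * An even byte such as 2 takes the AND path over the same word:
     *    and(N) result, src.uw[1], 0xff -> byte 2, zero-extended
     */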
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 6961cb1caf4..6e18bdfe68a 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -591,7 +591,7 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
*/
foreach_block_and_inst(block, fs_inst, inst, cfg) {
if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
- for (unsigned i = 0; i < 3; i++) {
+ for (unsigned i = 0; i < inst->sources; i++) {
if (inst->src[i].file == VGRF) {
ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
}
@@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
* messages adding a node interference to the grf127_send_hack_node.
* This node has a fixed asignment to grf127.
*
- * We don't apply it to SIMD16 because previous code avoids any register
- * overlap between sources and destination.
+ * We don't apply it to SIMD16 instructions because previous code avoids
+ * any register overlap between sources and destination.
*/
ra_set_node_reg(g, grf127_send_hack_node, 127);
- if (dispatch_width == 8) {
- foreach_block_and_inst(block, fs_inst, inst, cfg) {
- if (inst->is_send_from_grf() && inst->dst.file == VGRF)
- ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
- }
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (inst->exec_size < 16 && inst->is_send_from_grf() &&
+ inst->dst.file == VGRF)
+ ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
}
if (spilled_any_registers) {
@@ -711,14 +710,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
inst->src[2].file == VGRF &&
inst->src[3].file == VGRF &&
- inst->src[2].nr != inst->src[3].nr) {
- for (unsigned i = 0; i < inst->mlen; i++) {
- for (unsigned j = 0; j < inst->ex_mlen; j++) {
- ra_add_node_interference(g, inst->src[2].nr + i,
- inst->src[3].nr + j);
- }
- }
- }
+ inst->src[2].nr != inst->src[3].nr)
+ ra_add_node_interference(g, inst->src[2].nr,
+ inst->src[3].nr);
}
}
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index d05357e822e..c4427a658b0 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -357,6 +357,13 @@ class fs_inst : public backend_instruction {
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
+ /**
+ * Return whether \p arg is a control source of a virtual instruction which
+ * shouldn't contribute to the execution type and usual regioning
+ * restriction calculations of arithmetic instructions.
+ */
+ bool is_control_source(unsigned arg) const;
+
/**
* Return the subset of flag registers read by the instruction as a bitset
* with byte granularity.
@@ -461,7 +468,8 @@ get_exec_type(const fs_inst *inst)
brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
for (int i = 0; i < inst->sources; i++) {
- if (inst->src[i].file != BAD_FILE) {
+ if (inst->src[i].file != BAD_FILE &&
+ !inst->is_control_source(i)) {
const brw_reg_type t = get_exec_type(inst->src[i].type);
if (type_sz(t) > type_sz(exec_type))
exec_type = t;
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 9dbf06004a4..30c3f19fb4a 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -570,18 +570,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
OPT(nir_opt_dce);
OPT(nir_opt_cse);
- /* Passing 0 to the peephole select pass causes it to convert
- * if-statements that contain only move instructions in the branches
- * regardless of the count.
- *
- * Passing 1 to the peephole select pass causes it to convert
- * if-statements that contain at most a single ALU instruction (total)
- * in both branches. Before Gen6, some math instructions were
- * prohibitively expensive and the results of compare operations need an
- * extra resolve step. For these reasons, this pass is more harmful
- * than good on those platforms.
- *
- * For indirect loads of uniforms (push constants), we assume that array
+ /* For indirect loads of uniforms (push constants), we assume that array
* indices will nearly always be in bounds and the cost of the load is
* low. Therefore there shouldn't be a performance benefit to avoid it.
* However, in vec4 tessellation shaders, these loads operate by
@@ -590,9 +579,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
const bool is_vec4_tessellation = !is_scalar &&
(nir->info.stage == MESA_SHADER_TESS_CTRL ||
nir->info.stage == MESA_SHADER_TESS_EVAL);
- OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
- OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation,
- compiler->devinfo->gen >= 6);
+ OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation);
OPT(nir_opt_intrinsics);
OPT(nir_opt_idiv_const, 32);
@@ -794,6 +781,17 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
OPT(brw_nir_lower_mem_access_bit_sizes);
+ /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
+ * SSBOs, our back-end is capable of loading an entire vec4 at a time and
+ * we would like to take advantage of that whenever possible regardless of
+ * whether or not the app gives us full loads. This should allow the
+ * optimizer to combine UBO and SSBO load operations and save us some send
+ * messages.
+ */
+ OPT(nir_lower_array_deref_of_vec,
+ nir_var_mem_ubo | nir_var_mem_ssbo,
+ nir_lower_direct_array_deref_of_vec_load);
+
/* Get rid of split copies */
nir = brw_nir_optimize(nir, compiler, is_scalar, false);
@@ -842,6 +840,23 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
*producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false);
*consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false);
}
+
+ NIR_PASS_V(*producer, nir_lower_io_to_vector, nir_var_shader_out);
+ NIR_PASS_V(*consumer, nir_lower_io_to_vector, nir_var_shader_in);
+
+ if ((*producer)->info.stage != MESA_SHADER_TESS_CTRL) {
+ /* Calling lower_io_to_vector creates output variable writes with
+ * write-masks. On non-TCS outputs, the back-end can't handle it and we
+ * need to call nir_lower_io_to_temporaries to get rid of them. This,
+ * in turn, creates temporary variables and extra copy_deref intrinsics
+ * that we need to clean up.
+ */
+ NIR_PASS_V(*producer, nir_lower_io_to_temporaries,
+ nir_shader_get_entrypoint(*producer), true, false);
+ NIR_PASS_V(*producer, nir_lower_global_vars_to_local);
+ NIR_PASS_V(*producer, nir_split_var_copies);
+ NIR_PASS_V(*producer, nir_lower_var_copies);
+ }
}
/* Prepare the given shader for codegen
@@ -932,7 +947,9 @@ brw_nir_apply_sampler_key(nir_shader *nir,
bool is_scalar)
{
const struct gen_device_info *devinfo = compiler->devinfo;
- nir_lower_tex_options tex_options = { 0 };
+ nir_lower_tex_options tex_options = {
+ .lower_txd_clamp_if_sampler_index_not_lt_16 = true,
+ };
/* Iron Lake and prior require lowering of all rectangle textures */
if (devinfo->gen < 6)
@@ -964,6 +981,10 @@ brw_nir_apply_sampler_key(nir_shader *nir,
tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask;
tex_options.lower_ayuv_external = key_tex->ayuv_image_mask;
+ /* Setup array of scaling factors for each texture. */
+ memcpy(&tex_options.scale_factors, &key_tex->scale_factors,
+ sizeof(tex_options.scale_factors));
+
if (nir_lower_tex(nir, &tex_options)) {
nir_validate_shader(nir, "after nir_lower_tex");
nir = brw_nir_optimize(nir, compiler, is_scalar, false);
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 4489c682d01..785508f1e3f 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -1160,6 +1160,12 @@ vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo,
if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW)
return false;
+ /* If we write to the flag register changing the swizzle would change
+ * what channels are written to the flag register.
+ */
+ if (writes_flag())
+ return false;
+
/* We can't swizzle implicit accumulator access. We'd have to
* reswizzle the producer of the accumulator value in addition
* to the consumer (i.e. both MUL and MACH). Just skip this.
diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
index 760327d559d..a7a3bb8fb06 100644
--- a/src/intel/compiler/brw_vec4_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
@@ -173,19 +173,19 @@ opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
/* Given a sequence like:
*
- * cmp.ge.f0(8) g21<1>.xF g20<4>.xF g18<4>.xF
+ * cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF
* ...
- * cmp.nz.f0(8) null<1>D g21<4>.xD 0D
+ * cmp.nz.f0(8) null<1>D g21<4>.zD 0D
*
* Replace it with something like:
*
- * cmp.ge.f0(8) g22<1>F g20<4>.xF g18<4>.xF
- * mov(8) g21<1>.xF g22<1>.xxxxF
+ * cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF
+ * mov(8) g21<1>.zF g22<1>.zzzzF
*
* The added MOV will most likely be removed later. In the
* worst case, it should be cheaper to schedule.
*/
- temp.swizzle = inst->src[0].swizzle;
+ temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask);
temp.type = scan_inst->src[0].type;
vec4_instruction *mov = v->MOV(scan_inst->dst, temp);
diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp
index 659fbb2d1bc..4215af1fb02 100644
--- a/src/intel/compiler/test_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/test_fs_cmod_propagation.cpp
@@ -889,3 +889,35 @@ TEST_F(cmod_propagation_test, subtract_delete_compare_derp)
EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 1)->opcode);
EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
}
+
+TEST_F(cmod_propagation_test, signed_unsigned_comparison_mismatch)
+{
+ const fs_builder &bld = v->bld;
+ fs_reg dest0 = v->vgrf(glsl_type::int_type);
+ fs_reg src0 = v->vgrf(glsl_type::int_type);
+ src0.type = BRW_REGISTER_TYPE_W;
+
+ bld.ASR(dest0, negate(src0), brw_imm_d(15));
+ bld.CMP(bld.null_reg_ud(), retype(dest0, BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0u), BRW_CONDITIONAL_LE);
+
+ /* = Before =
+ * 0: asr(8) dest:D -src0:W 15D
+ * 1: cmp.le.f0(8) null:UD dest:UD 0UD
+ *
+ * = After =
+ * (no changes)
+ */
+ v->calculate_cfg();
+ bblock_t *block0 = v->cfg->blocks[0];
+
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+
+ EXPECT_FALSE(cmod_propagation(v));
+ EXPECT_EQ(0, block0->start_ip);
+ EXPECT_EQ(1, block0->end_ip);
+ EXPECT_EQ(BRW_OPCODE_ASR, instruction(block0, 0)->opcode);
+ EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+ EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 1)->conditional_mod);
+}
diff --git a/src/intel/dev/gen_device_info.c b/src/intel/dev/gen_device_info.c
index 5dbd0607572..625ebc031dc 100644
--- a/src/intel/dev/gen_device_info.c
+++ b/src/intel/dev/gen_device_info.c
@@ -414,6 +414,7 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = {
.has_64bit_types = true, \
.supports_simd16_3src = true, \
.has_surface_tile_offset = true, \
+ .num_thread_per_eu = 7, \
.max_vs_threads = 504, \
.max_tcs_threads = 504, \
.max_tes_threads = 504, \
@@ -427,7 +428,6 @@ static const struct gen_device_info gen_device_info_bdw_gt1 = {
.num_slices = 1,
.num_subslices = { 2, },
.num_eu_per_subslice = 8,
- .num_thread_per_eu = 7,
.l3_banks = 2,
.max_cs_threads = 42,
.urb = {
@@ -452,7 +452,6 @@ static const struct gen_device_info gen_device_info_bdw_gt2 = {
.num_slices = 1,
.num_subslices = { 3, },
.num_eu_per_subslice = 8,
- .num_thread_per_eu = 7,
.l3_banks = 4,
.max_cs_threads = 56,
.urb = {
@@ -477,7 +476,6 @@ static const struct gen_device_info gen_device_info_bdw_gt3 = {
.num_slices = 2,
.num_subslices = { 3, 3, },
.num_eu_per_subslice = 8,
- .num_thread_per_eu = 7,
.l3_banks = 8,
.max_cs_threads = 56,
.urb = {
@@ -503,7 +501,6 @@ static const struct gen_device_info gen_device_info_chv = {
.num_slices = 1,
.num_subslices = { 2, },
.num_eu_per_subslice = 8,
- .num_thread_per_eu = 7,
.l3_banks = 2,
.max_vs_threads = 80,
.max_tcs_threads = 80,
@@ -609,8 +606,7 @@ static const struct gen_device_info gen_device_info_chv = {
#define GEN9_FEATURES \
GEN8_FEATURES, \
GEN9_HW_INFO, \
- .has_sample_with_hiz = true, \
- .num_thread_per_eu = 7
+ .has_sample_with_hiz = true
static const struct gen_device_info gen_device_info_skl_gt1 = {
GEN9_FEATURES, .gt = 1,
@@ -777,6 +773,7 @@ static const struct gen_device_info gen_device_info_cfl_gt1 = {
.num_subslices = { 2, },
.num_eu_per_subslice = 6,
.l3_banks = 2,
+ .urb.size = 192,
.simulator_id = 24,
};
static const struct gen_device_info gen_device_info_cfl_gt2 = {
diff --git a/src/intel/genxml/gen10.xml b/src/intel/genxml/gen10.xml
index 284633aedd4..4cb1f05ae25 100644
--- a/src/intel/genxml/gen10.xml
+++ b/src/intel/genxml/gen10.xml
@@ -2043,7 +2043,10 @@
-
+
+
+
+
diff --git a/src/intel/genxml/gen11.xml b/src/intel/genxml/gen11.xml
index 95a84a2f597..a7c06c5ab60 100644
--- a/src/intel/genxml/gen11.xml
+++ b/src/intel/genxml/gen11.xml
@@ -2063,7 +2063,10 @@
-
+
+
+
+
diff --git a/src/intel/genxml/gen7.xml b/src/intel/genxml/gen7.xml
index 363fd8664bf..1b2c7d996f9 100644
--- a/src/intel/genxml/gen7.xml
+++ b/src/intel/genxml/gen7.xml
@@ -1399,7 +1399,10 @@
-
+
+
+
+
diff --git a/src/intel/genxml/gen75.xml b/src/intel/genxml/gen75.xml
index a1da9cae041..95b306139eb 100644
--- a/src/intel/genxml/gen75.xml
+++ b/src/intel/genxml/gen75.xml
@@ -1713,7 +1713,10 @@
-
+
+
+
+
diff --git a/src/intel/genxml/gen8.xml b/src/intel/genxml/gen8.xml
index 4676d9bca9c..0226d7c0c66 100644
--- a/src/intel/genxml/gen8.xml
+++ b/src/intel/genxml/gen8.xml
@@ -1816,7 +1816,10 @@
-
+
+
+
+
diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml
index 8afa986df55..88fc2da7885 100644
--- a/src/intel/genxml/gen9.xml
+++ b/src/intel/genxml/gen9.xml
@@ -1995,7 +1995,10 @@
-
+
+
+
+
diff --git a/src/intel/genxml/meson.build b/src/intel/genxml/meson.build
index d0c982d0f8b..343b4fcc45f 100644
--- a/src/intel/genxml/meson.build
+++ b/src/intel/genxml/meson.build
@@ -57,3 +57,5 @@ foreach f : gen_xml_files
capture : true,
)
endforeach
+
+idep_genxml = declare_dependency(sources : [gen_xml_pack, genX_bits_h, genX_xml_h])
diff --git a/src/intel/meson.build b/src/intel/meson.build
index 3c57e79d325..a5bb03e314a 100644
--- a/src/intel/meson.build
+++ b/src/intel/meson.build
@@ -21,9 +21,9 @@
c_sse2_args = ['-msse2', '-mstackrealign']
inc_intel = include_directories('.')
+subdir('genxml')
subdir('blorp')
subdir('dev')
-subdir('genxml')
subdir('isl')
subdir('common')
subdir('compiler')
diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index 006175c8c65..e9cc5764924 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -165,7 +165,7 @@ anv_state_table_init(struct anv_state_table *table,
goto fail_fd;
}
- if (!u_vector_init(&table->mmap_cleanups,
+ if (!u_vector_init(&table->cleanups,
round_to_power_of_two(sizeof(struct anv_state_table_cleanup)),
128)) {
result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
@@ -179,12 +179,12 @@ anv_state_table_init(struct anv_state_table *table,
uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
result = anv_state_table_expand_range(table, initial_size);
if (result != VK_SUCCESS)
- goto fail_mmap_cleanups;
+ goto fail_cleanups;
return VK_SUCCESS;
- fail_mmap_cleanups:
- u_vector_finish(&table->mmap_cleanups);
+ fail_cleanups:
+ u_vector_finish(&table->cleanups);
fail_fd:
close(table->fd);
@@ -195,7 +195,7 @@ static VkResult
anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
{
void *map;
- struct anv_mmap_cleanup *cleanup;
+ struct anv_state_table_cleanup *cleanup;
/* Assert that we only ever grow the pool */
assert(size >= table->state.end);
@@ -204,11 +204,11 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
if (size > BLOCK_POOL_MEMFD_SIZE)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- cleanup = u_vector_add(&table->mmap_cleanups);
+ cleanup = u_vector_add(&table->cleanups);
if (!cleanup)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- *cleanup = ANV_MMAP_CLEANUP_INIT;
+ *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
/* Just leak the old map until we destroy the pool. We can't munmap it
* without races or imposing locking on the block allocate fast path. On
@@ -272,12 +272,12 @@ anv_state_table_finish(struct anv_state_table *table)
{
struct anv_state_table_cleanup *cleanup;
- u_vector_foreach(cleanup, &table->mmap_cleanups) {
+ u_vector_foreach(cleanup, &table->cleanups) {
if (cleanup->map)
munmap(cleanup->map, cleanup->size);
}
- u_vector_finish(&table->mmap_cleanups);
+ u_vector_finish(&table->cleanups);
close(table->fd);
}
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 53303e0e745..60d332c33b6 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -128,8 +128,13 @@ static void
anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state)
{
- for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++)
- vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]);
+ for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) {
+ if (pipe_state->push_descriptors[i]) {
+ anv_descriptor_set_layout_unref(cmd_buffer->device,
+ pipe_state->push_descriptors[i]->set.layout);
+ vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]);
+ }
+ }
}
static void
@@ -957,10 +962,11 @@ anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer)
return iview;
}
-static struct anv_push_descriptor_set *
-anv_cmd_buffer_get_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
- VkPipelineBindPoint bind_point,
- uint32_t set)
+static struct anv_descriptor_set *
+anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ struct anv_descriptor_set_layout *layout,
+ uint32_t _set)
{
struct anv_cmd_pipeline_state *pipe_state;
if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
@@ -971,19 +977,31 @@ anv_cmd_buffer_get_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
}
struct anv_push_descriptor_set **push_set =
- &pipe_state->push_descriptors[set];
+ &pipe_state->push_descriptors[_set];
if (*push_set == NULL) {
- *push_set = vk_alloc(&cmd_buffer->pool->alloc,
- sizeof(struct anv_push_descriptor_set), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ *push_set = vk_zalloc(&cmd_buffer->pool->alloc,
+ sizeof(struct anv_push_descriptor_set), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (*push_set == NULL) {
anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
return NULL;
}
}
- return *push_set;
+ struct anv_descriptor_set *set = &(*push_set)->set;
+
+ if (set->layout != layout) {
+ if (set->layout)
+ anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout);
+ anv_descriptor_set_layout_ref(layout);
+ set->layout = layout;
+ }
+ set->size = anv_descriptor_set_layout_size(layout);
+ set->buffer_count = layout->buffer_count;
+ set->buffer_views = (*push_set)->buffer_views;
+
+ return set;
}
void anv_CmdPushDescriptorSetKHR(
@@ -1001,19 +1019,12 @@ void anv_CmdPushDescriptorSetKHR(
struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
- struct anv_push_descriptor_set *push_set =
- anv_cmd_buffer_get_push_descriptor_set(cmd_buffer,
- pipelineBindPoint, _set);
- if (!push_set)
+ struct anv_descriptor_set *set =
+ anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint,
+ set_layout, _set);
+ if (!set)
return;
- struct anv_descriptor_set *set = &push_set->set;
-
- set->layout = set_layout;
- set->size = anv_descriptor_set_layout_size(set_layout);
- set->buffer_count = set_layout->buffer_count;
- set->buffer_views = push_set->buffer_views;
-
/* Go through the user supplied descriptors. */
for (uint32_t i = 0; i < descriptorWriteCount; i++) {
const VkWriteDescriptorSet *write = &pDescriptorWrites[i];
@@ -1093,19 +1104,12 @@ void anv_CmdPushDescriptorSetWithTemplateKHR(
struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
- struct anv_push_descriptor_set *push_set =
- anv_cmd_buffer_get_push_descriptor_set(cmd_buffer,
- template->bind_point, _set);
- if (!push_set)
+ struct anv_descriptor_set *set =
+ anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point,
+ set_layout, _set);
+ if (!set)
return;
- struct anv_descriptor_set *set = &push_set->set;
-
- set->layout = set_layout;
- set->size = anv_descriptor_set_layout_size(set_layout);
- set->buffer_count = set_layout->buffer_count;
- set->buffer_views = push_set->buffer_views;
-
anv_descriptor_set_write_template(set,
cmd_buffer->device,
&cmd_buffer->surface_state_stream,
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index a4e466cf3dd..0259abea0bf 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -58,6 +58,9 @@ void anv_GetDescriptorSetLayoutSupport(
anv_foreach_stage(s, binding->stageFlags)
surface_count[s] += sampler->n_planes;
}
+ } else {
+ anv_foreach_stage(s, binding->stageFlags)
+ surface_count[s] += binding->descriptorCount;
}
break;
@@ -70,10 +73,10 @@ void anv_GetDescriptorSetLayoutSupport(
bool supported = true;
for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
- /* Our maximum binding table size is 250 and we need to reserve 8 for
- * render targets. 240 is a nice round number.
+ /* Our maximum binding table size is 240 and we need to reserve 8 for
+ * render targets.
*/
- if (surface_count[s] >= 240)
+ if (surface_count[s] >= MAX_BINDING_TABLE_SIZE - MAX_RTS)
supported = false;
}
@@ -458,6 +461,8 @@ VkResult anv_CreateDescriptorPool(
&device->surface_state_pool, 4096);
pool->surface_state_free_list = NULL;
+ list_inithead(&pool->desc_sets);
+
*pDescriptorPool = anv_descriptor_pool_to_handle(pool);
return VK_SUCCESS;
@@ -474,7 +479,13 @@ void anv_DestroyDescriptorPool(
if (!pool)
return;
+ list_for_each_entry_safe(struct anv_descriptor_set, set,
+ &pool->desc_sets, pool_link) {
+ anv_descriptor_set_destroy(device, pool, set);
+ }
+
anv_state_stream_finish(&pool->surface_state_stream);
+
vk_free2(&device->alloc, pAllocator, pool);
}
@@ -486,6 +497,11 @@ VkResult anv_ResetDescriptorPool(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool);
+ list_for_each_entry_safe(struct anv_descriptor_set, set,
+ &pool->desc_sets, pool_link) {
+ anv_descriptor_set_destroy(device, pool, set);
+ }
+
pool->next = 0;
pool->free_list = EMPTY;
anv_state_stream_finish(&pool->surface_state_stream);
@@ -630,6 +646,8 @@ anv_descriptor_set_destroy(struct anv_device *device,
entry->size = set->size;
pool->free_list = (char *) entry - pool->data;
}
+
+ list_del(&set->pool_link);
}
VkResult anv_AllocateDescriptorSets(
@@ -652,6 +670,8 @@ VkResult anv_AllocateDescriptorSets(
if (result != VK_SUCCESS)
break;
+ list_addtail(&set->pool_link, &pool->desc_sets);
+
pDescriptorSets[i] = anv_descriptor_set_to_handle(set);
}
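
Note the choice of list_for_each_entry_safe in both teardown paths above; the
plain iterator would break (sketch):

    /* anv_descriptor_set_destroy() now ends with
     *    list_del(&set->pool_link);
     * so list_for_each_entry() would chase the next pointer of a node that
     * was just unlinked and freed; the _safe variant caches it before the
     * loop body runs.
     */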
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index f44b046cf5d..99b512a0387 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -128,6 +128,8 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
*/
device->memory.heap_count = 1;
device->memory.heaps[0] = (struct anv_memory_heap) {
+ .vma_start = LOW_HEAP_MIN_ADDRESS,
+ .vma_size = LOW_HEAP_SIZE,
.size = heap_size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.supports_48bit_addresses = false,
@@ -147,11 +149,19 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
device->memory.heap_count = 2;
device->memory.heaps[0] = (struct anv_memory_heap) {
+ .vma_start = HIGH_HEAP_MIN_ADDRESS,
+ /* Leave the last 4GiB out of the high vma range, so that no state
+ * base address + size can overflow 48 bits. For more information see
+ * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
+ */
+ .vma_size = gtt_size - (1ull << 32) - HIGH_HEAP_MIN_ADDRESS,
.size = heap_size_48bit,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.supports_48bit_addresses = true,
};
device->memory.heaps[1] = (struct anv_memory_heap) {
+ .vma_start = LOW_HEAP_MIN_ADDRESS,
+ .vma_size = LOW_HEAP_SIZE,
.size = heap_size_32bit,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.supports_48bit_addresses = false,
@@ -1029,7 +1039,7 @@ void anv_GetPhysicalDeviceProperties(
.maxPerStageDescriptorSampledImages = max_samplers,
.maxPerStageDescriptorStorageImages = max_images,
.maxPerStageDescriptorInputAttachments = 64,
- .maxPerStageResources = 250,
+ .maxPerStageResources = MAX_BINDING_TABLE_SIZE - MAX_RTS,
.maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
.maxDescriptorSetUniformBuffers = 6 * 64, /* number of stages * maxPerStageDescriptorUniformBuffers */
.maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
@@ -1068,7 +1078,7 @@ void anv_GetPhysicalDeviceProperties(
16 * devinfo->max_cs_threads,
16 * devinfo->max_cs_threads,
},
- .subPixelPrecisionBits = 4 /* FIXME */,
+ .subPixelPrecisionBits = 8,
.subTexelPrecisionBits = 4 /* FIXME */,
.mipmapPrecisionBits = 4 /* FIXME */,
.maxDrawIndexedIndexValue = UINT32_MAX,
@@ -1806,18 +1816,16 @@ VkResult anv_CreateDevice(
}
/* keep the page with address zero out of the allocator */
- util_vma_heap_init(&device->vma_lo, LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE);
- device->vma_lo_available =
- physical_device->memory.heaps[physical_device->memory.heap_count - 1].size;
-
- /* Leave the last 4GiB out of the high vma range, so that no state base
- * address + size can overflow 48 bits. For more information see the
- * comment about Wa32bitGeneralStateOffset in anv_allocator.c
- */
- util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS,
- HIGH_HEAP_SIZE);
+ struct anv_memory_heap *low_heap =
+ &physical_device->memory.heaps[physical_device->memory.heap_count - 1];
+ util_vma_heap_init(&device->vma_lo, low_heap->vma_start, low_heap->vma_size);
+ device->vma_lo_available = low_heap->size;
+
+ struct anv_memory_heap *high_heap =
+ &physical_device->memory.heaps[0];
+ util_vma_heap_init(&device->vma_hi, high_heap->vma_start, high_heap->vma_size);
device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 :
- physical_device->memory.heaps[0].size;
+ high_heap->size;
}
/* As per spec, the driver implementation may deny requests to acquire
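
The relocated comment is easier to check with numbers. Assuming a full 48-bit
GTT (a sketch; the real gtt_size is reported by the kernel):

    /* gtt_size  = 1ull << 48                  (256 TiB)
     * vma_start = HIGH_HEAP_MIN_ADDRESS       (7 GiB)
     * vma_size  = (1ull << 48) - (1ull << 32) - (7ull << 30)
     * => the heap tops out 4 GiB below 2^48, so any base address plus a
     *    state size of up to 4 GiB still fits in 48 bits.
     */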
@@ -1866,7 +1874,7 @@ VkResult anv_CreateDevice(
result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
- if (pthread_cond_init(&device->queue_submit, NULL) != 0) {
+ if (pthread_cond_init(&device->queue_submit, &condattr) != 0) {
pthread_condattr_destroy(&condattr);
result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
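
The fix makes the condition variable actually use the attributes initialized
a few lines earlier; in context (a sketch; the setclock call reflects the
presumed intent of the surrounding condattr setup):

    pthread_condattr_t condattr;
    if (pthread_condattr_init(&condattr) != 0)
       goto fail_mutex;
    if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0)
       goto fail_mutex;
    /* Previously NULL was passed here, silently discarding condattr. */
    if (pthread_cond_init(&device->queue_submit, &condattr) != 0)
       goto fail_mutex;
    pthread_condattr_destroy(&condattr);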
@@ -2276,8 +2284,11 @@ anv_vma_free(struct anv_device *device, struct anv_bo *bo)
util_vma_heap_free(&device->vma_lo, addr_48b, bo->size);
device->vma_lo_available += bo->size;
} else {
- assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS &&
- addr_48b <= HIGH_HEAP_MAX_ADDRESS);
+ MAYBE_UNUSED const struct anv_physical_device *physical_device =
+ &device->instance->physicalDevice;
+ assert(addr_48b >= physical_device->memory.heaps[0].vma_start &&
+ addr_48b < (physical_device->memory.heaps[0].vma_start +
+ physical_device->memory.heaps[0].vma_size));
util_vma_heap_free(&device->vma_hi, addr_48b, bo->size);
device->vma_hi_available += bo->size;
}
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index 22bad94e5b8..c898136e88e 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -71,18 +71,18 @@ def __init__(self, version, enable):
EXTENSIONS = [
Extension('VK_ANDROID_external_memory_android_hardware_buffer', 3, 'ANDROID'),
Extension('VK_ANDROID_native_buffer', 5, 'ANDROID'),
- Extension('VK_KHR_8bit_storage', 1, 'device->info.gen >= 8'),
+ Extension('VK_KHR_8bit_storage', 1, 'device->info.gen >= 8 && !ANDROID'),
Extension('VK_KHR_16bit_storage', 1, 'device->info.gen >= 8'),
Extension('VK_KHR_bind_memory2', 1, True),
- Extension('VK_KHR_create_renderpass2', 1, True),
+ Extension('VK_KHR_create_renderpass2', 1, '!ANDROID'),
Extension('VK_KHR_dedicated_allocation', 1, True),
- Extension('VK_KHR_depth_stencil_resolve', 1, True),
+ Extension('VK_KHR_depth_stencil_resolve', 1, '!ANDROID'),
Extension('VK_KHR_descriptor_update_template', 1, True),
Extension('VK_KHR_device_group', 1, True),
Extension('VK_KHR_device_group_creation', 1, True),
Extension('VK_KHR_display', 23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
Extension('VK_KHR_draw_indirect_count', 1, True),
- Extension('VK_KHR_driver_properties', 1, True),
+ Extension('VK_KHR_driver_properties', 1, '!ANDROID'),
Extension('VK_KHR_external_fence', 1,
'device->has_syncobj_wait'),
Extension('VK_KHR_external_fence_capabilities', 1, True),
@@ -128,14 +128,15 @@ def __init__(self, version, enable):
'device->has_context_priority'),
Extension('VK_EXT_pci_bus_info', 2, True),
Extension('VK_EXT_post_depth_coverage', 1, 'device->info.gen >= 9'),
+ Extension('VK_EXT_queue_family_foreign', 1, 'ANDROID'),
Extension('VK_EXT_sampler_filter_minmax', 1, 'device->info.gen >= 9'),
Extension('VK_EXT_scalar_block_layout', 1, True),
Extension('VK_EXT_shader_viewport_index_layer', 1, True),
Extension('VK_EXT_shader_stencil_export', 1, 'device->info.gen >= 9'),
Extension('VK_EXT_transform_feedback', 1, True),
Extension('VK_EXT_vertex_attribute_divisor', 3, True),
- Extension('VK_GOOGLE_decorate_string', 1, True),
- Extension('VK_GOOGLE_hlsl_functionality1', 1, True),
+ Extension('VK_GOOGLE_decorate_string', 1, '!ANDROID'),
+ Extension('VK_GOOGLE_hlsl_functionality1', 1, '!ANDROID'),
]
class VkVersion:
diff --git a/src/intel/vulkan/anv_intel.c b/src/intel/vulkan/anv_intel.c
index 08bff9585bc..f6b9584b410 100644
--- a/src/intel/vulkan/anv_intel.c
+++ b/src/intel/vulkan/anv_intel.c
@@ -64,7 +64,8 @@ VkResult anv_CreateDmaBufImageINTEL(
.samples = 1,
/* FIXME: Need a way to use X tiling to allow scanout */
.tiling = VK_IMAGE_TILING_OPTIMAL,
- .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+ .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_SAMPLED_BIT,
.flags = 0,
}},
pAllocator, &image_h);
diff --git a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
index 104c58dc5e2..0567a1be939 100644
--- a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
+++ b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
@@ -269,6 +269,7 @@ create_plane_tex_instr_implicit(struct ycbcr_state *state,
tex->texture_index = old_tex->texture_index;
tex->texture_array_size = old_tex->texture_array_size;
tex->sampler_index = old_tex->sampler_index;
+ tex->is_array = old_tex->is_array;
nir_ssa_dest_init(&tex->instr, &tex->dest,
old_tex->dest.ssa.num_components,
diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
index 02f2be60e02..a1c72395831 100644
--- a/src/intel/vulkan/anv_pass.c
+++ b/src/intel/vulkan/anv_pass.c
@@ -178,12 +178,28 @@ anv_render_pass_compile(struct anv_render_pass *pass)
* subpasses and checking to see if any of them don't have an external
* dependency. Or, we could just be lazy and add a couple extra flushes.
* We choose to be lazy.
+ *
+ * From the documentation for vkCmdNextSubpass:
+ *
+ * "Moving to the next subpass automatically performs any multisample
+ * resolve operations in the subpass being ended. End-of-subpass
+ * multisample resolves are treated as color attachment writes for the
+ * purposes of synchronization. This applies to resolve operations for
+ * both color and depth/stencil attachments. That is, they are
+ * considered to execute in the
+ * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
+ * their writes are synchronized with
+ * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT."
+ *
+ * Therefore, the above flags concerning color attachments also apply to
+ * color and depth/stencil resolve attachments.
*/
if (all_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
pass->subpass_flushes[0] |=
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
}
- if (all_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+ if (all_usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
pass->subpass_flushes[pass->subpass_count] |=
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
}
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index be869cfa061..1bdc896e708 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -377,12 +377,12 @@ populate_wm_prog_key(const struct gen_device_info *devinfo,
* harmless to compute it and then let dead-code take care of it.
*/
if (ms_info->rasterizationSamples > 1) {
- key->persample_interp =
+ key->persample_interp = ms_info->sampleShadingEnable &&
(ms_info->minSampleShading * ms_info->rasterizationSamples) > 1;
key->multisample_fbo = true;
}
- key->frag_coord_adds_sample_pos = ms_info->sampleShadingEnable;
+ key->frag_coord_adds_sample_pos = key->persample_interp;
}
}
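
A worked case for the key change (sketch):

    /* rasterizationSamples = 4, minSampleShading = 0.5:
     *   0.5 * 4 = 2 > 1, so per-sample interpolation is requested, but now
     *   only when sampleShadingEnable is also set. gl_FragCoord in turn
     *   only picks up the per-sample position when per-sample interpolation
     *   is actually in effect, instead of keying off the enable bit alone.
     */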
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 110b2ccf023..9979b832a7b 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -120,12 +120,9 @@ struct gen_l3_config;
#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */
#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
#define HIGH_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */
-#define HIGH_HEAP_MAX_ADDRESS 0xfffeffffffffULL
#define LOW_HEAP_SIZE \
(LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1)
-#define HIGH_HEAP_SIZE \
- (HIGH_HEAP_MAX_ADDRESS - HIGH_HEAP_MIN_ADDRESS + 1)
#define DYNAMIC_STATE_POOL_SIZE \
(DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1)
#define BINDING_TABLE_POOL_SIZE \
@@ -163,6 +160,18 @@ struct gen_l3_config;
#define MAX_GEN8_IMAGES 8
#define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */
+/* From the Skylake PRM Vol. 7 "Binding Table Surface State Model":
+ *
+ * "The surface state model is used when a Binding Table Index (specified
+ * in the message descriptor) of less than 240 is specified. In this model,
+ * the Binding Table Index is used to index into the binding table, and the
+ * binding table entry contains a pointer to the SURFACE_STATE."
+ *
+ * Binding table values above 240 are used for various things in the hardware
+ * such as stateless, stateless with incoherent cache, SLM, and bindless.
+ */
+#define MAX_BINDING_TABLE_SIZE 240
+
/* The kernel relocation API has a limitation of a 32-bit delta value
* applied to the address before it is written which, in spite of it being
* unsigned, is treated as signed. Because of the way that this maps to
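
With MAX_RTS == 8 already defined in this header, the derived budget is:

    /* MAX_BINDING_TABLE_SIZE - MAX_RTS == 240 - 8 == 232, which now feeds
     * both maxPerStageResources and the descriptor-set-layout support
     * check, replacing the hard-coded 250 and 240.
     */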
@@ -733,7 +742,7 @@ struct anv_state_table {
struct anv_free_entry *map;
uint32_t size;
struct anv_block_state state;
- struct u_vector mmap_cleanups;
+ struct u_vector cleanups;
};
struct anv_state_pool {
@@ -894,6 +903,8 @@ struct anv_memory_heap {
VkMemoryHeapFlags flags;
/* Driver-internal book-keeping */
+ uint64_t vma_start;
+ uint64_t vma_size;
bool supports_48bit_addresses;
};
@@ -1449,10 +1460,10 @@ _anv_combine_address(struct anv_batch *batch, void *location,
*/
/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
-#define GEN9_MOCS 2
+#define GEN9_MOCS (2 << 1)
/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
-#define GEN9_EXTERNAL_MOCS 1
+#define GEN9_EXTERNAL_MOCS (1 << 1)
/* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */
#define GEN10_MOCS GEN9_MOCS
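
The new shift in these defines reflects the field layout: on Gen9+ the MOCS entry index is assumed to occupy bits 6:1 of the MOCS field, with bit 0 reserved, so entry n is programmed as (n << 1). A hedged one-liner capturing that encoding:

   /* Sketch (hypothetical macro, not in the patch): encode a Gen9+ MOCS
    * table index into the field value, index in bits 6:1. */
   #define GEN9_MOCS_ENTRY(n) ((n) << 1)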
@@ -1581,6 +1592,10 @@ struct anv_descriptor_set {
uint32_t size;
uint32_t buffer_count;
struct anv_buffer_view *buffer_views;
+
+ /* Link to the descriptor pool's desc_sets list. */
+ struct list_head pool_link;
+
struct anv_descriptor descriptors[0];
};
@@ -1614,6 +1629,8 @@ struct anv_descriptor_pool {
struct anv_state_stream surface_state_stream;
void *surface_state_free_list;
+ struct list_head desc_sets;
+
char data[0];
};
@@ -3045,7 +3062,13 @@ anv_can_sample_with_hiz(const struct gen_device_info * const devinfo,
if (!(image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
return false;
- if (devinfo->gen < 8)
+ /* Allow this feature on BDW even though it is disabled in the BDW devinfo
+ * struct. There is documentation suggesting that this feature actually
+ * reduces performance on BDW, but so far it has only been observed to
+ * help. Sampling fast-cleared blocks on BDW must also be handled with care
+ * (see depth_stencil_attachment_compute_aux_usage() for more info).
+ */
+ if (devinfo->gen != 8 && !devinfo->has_sample_with_hiz)
return false;
return image->samples == 1;
diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c
index 1159ccecc6a..b00342d8f2b 100644
--- a/src/intel/vulkan/anv_util.c
+++ b/src/intel/vulkan/anv_util.c
@@ -76,6 +76,77 @@ __anv_perf_warn(struct anv_instance *instance, const void *object,
intel_logw("%s:%d: PERF: %s", file, line, buffer);
}
+const char *
+vk_Result_to_str_pri(VkResult input)
+{
+ switch(input) {
+ case -1000244000:
+ return "VK_ERROR_INVALID_DEVICE_ADDRESS_EXT";
+ case -1000174001:
+ return "VK_ERROR_NOT_PERMITTED_EXT";
+ case -1000161000:
+ return "VK_ERROR_FRAGMENTATION_EXT";
+ case -1000158000:
+ return "VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT";
+ case -1000072003:
+ return "VK_ERROR_INVALID_EXTERNAL_HANDLE";
+ case -1000069000:
+ return "VK_ERROR_OUT_OF_POOL_MEMORY";
+ case -1000012000:
+ return "VK_ERROR_INVALID_SHADER_NV";
+ case -1000011001:
+ return "VK_ERROR_VALIDATION_FAILED_EXT";
+ case -1000003001:
+ return "VK_ERROR_INCOMPATIBLE_DISPLAY_KHR";
+ case -1000001004:
+ return "VK_ERROR_OUT_OF_DATE_KHR";
+ case -1000000001:
+ return "VK_ERROR_NATIVE_WINDOW_IN_USE_KHR";
+ case -1000000000:
+ return "VK_ERROR_SURFACE_LOST_KHR";
+ case -12:
+ return "VK_ERROR_FRAGMENTED_POOL";
+ case -11:
+ return "VK_ERROR_FORMAT_NOT_SUPPORTED";
+ case -10:
+ return "VK_ERROR_TOO_MANY_OBJECTS";
+ case -9:
+ return "VK_ERROR_INCOMPATIBLE_DRIVER";
+ case -8:
+ return "VK_ERROR_FEATURE_NOT_PRESENT";
+ case -7:
+ return "VK_ERROR_EXTENSION_NOT_PRESENT";
+ case -6:
+ return "VK_ERROR_LAYER_NOT_PRESENT";
+ case -5:
+ return "VK_ERROR_MEMORY_MAP_FAILED";
+ case -4:
+ return "VK_ERROR_DEVICE_LOST";
+ case -3:
+ return "VK_ERROR_INITIALIZATION_FAILED";
+ case -2:
+ return "VK_ERROR_OUT_OF_DEVICE_MEMORY";
+ case -1:
+ return "VK_ERROR_OUT_OF_HOST_MEMORY";
+ case 0:
+ return "VK_SUCCESS";
+ case 1:
+ return "VK_NOT_READY";
+ case 2:
+ return "VK_TIMEOUT";
+ case 3:
+ return "VK_EVENT_SET";
+ case 4:
+ return "VK_EVENT_RESET";
+ case 5:
+ return "VK_INCOMPLETE";
+ case 1000001003:
+ return "VK_SUBOPTIMAL_KHR";
+ default:
+ unreachable("Undefined enum value.");
+ }
+}
+
VkResult
__vk_errorv(struct anv_instance *instance, const void *object,
VkDebugReportObjectTypeEXT type, VkResult error,
@@ -84,7 +155,7 @@ __vk_errorv(struct anv_instance *instance, const void *object,
char buffer[256];
char report[512];
- const char *error_str = vk_Result_to_str(error);
+ const char *error_str = vk_Result_to_str_pri(error);
if (format) {
vsnprintf(buffer, sizeof(buffer), format, ap);
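
For reference, the helper above is used as a drop-in replacement for the generated vk_Result_to_str(); a hedged usage sketch:

   VkResult res = VK_ERROR_DEVICE_LOST;
   fprintf(stderr, "anv: %s\n", vk_Result_to_str_pri(res));
   /* prints: anv: VK_ERROR_DEVICE_LOST */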
diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c
index 352892aee33..380283bdd56 100644
--- a/src/intel/vulkan/gen7_cmd_buffer.c
+++ b/src/intel/vulkan/gen7_cmd_buffer.c
@@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
};
const int max = 0xffff;
+
+ uint32_t y_min = s->offset.y;
+ uint32_t x_min = s->offset.x;
+ uint32_t y_max = s->offset.y + s->extent.height - 1;
+ uint32_t x_max = s->offset.x + s->extent.width - 1;
+
+ /* Do this math using int64_t so overflow gets clamped correctly. */
+ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ y_min = clamp_int64((uint64_t) y_min,
+ cmd_buffer->state.render_area.offset.y, max);
+ x_min = clamp_int64((uint64_t) x_min,
+ cmd_buffer->state.render_area.offset.x, max);
+ y_max = clamp_int64((uint64_t) y_max, 0,
+ cmd_buffer->state.render_area.offset.y +
+ cmd_buffer->state.render_area.extent.height - 1);
+ x_max = clamp_int64((uint64_t) x_max, 0,
+ cmd_buffer->state.render_area.offset.x +
+ cmd_buffer->state.render_area.extent.width - 1);
+ } else if (fb) {
+ y_min = clamp_int64((uint64_t) y_min, 0, max);
+ x_min = clamp_int64((uint64_t) x_min, 0, max);
+ y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1);
+ x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1);
+ }
+
struct GEN7_SCISSOR_RECT scissor = {
- /* Do this math using int64_t so overflow gets clamped correctly. */
- .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max),
- .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max),
- .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1),
- .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1)
+ .ScissorRectangleYMin = y_min,
+ .ScissorRectangleXMin = x_min,
+ .ScissorRectangleYMax = y_max,
+ .ScissorRectangleXMax = x_max
};
if (s->extent.width <= 0 || s->extent.height <= 0) {
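
As a worked illustration of why the clamps are done in 64-bit (a sketch, not from the patch):

   /* s->offset.x = 0x7fffffff, s->extent.width = 2:
    *   32-bit: 0x7fffffff + 2 - 1 wraps to INT32_MIN (undefined for int).
    *   64-bit: (int64_t)0x7fffffff + 2 - 1 = 0x80000000, which then clamps
    *           cleanly to the 0xffff hardware maximum (or the render area).
    */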
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index d980ec428d0..a3994f5870c 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2653,7 +2653,7 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferIndex = idx;
- if (cmd_buffer->state.xfb_enabled && xfb->buffer) {
+ if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
sob.SOBufferEnable = true;
sob.MOCS = cmd_buffer->device->default_mocs,
sob.StreamOffsetWriteEnable = false;
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index d2142ae42c2..3e13a12d776 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -464,6 +464,7 @@ emit_rs_state(struct anv_pipeline *pipeline,
sf.TriangleStripListProvokingVertexSelect = 0;
sf.LineStripListProvokingVertexSelect = 0;
sf.TriangleFanProvokingVertexSelect = 1;
+ sf.VertexSubPixelPrecisionSelect = _8Bit;
const struct brw_vue_prog_data *last_vue_prog_data =
anv_pipeline_get_last_vue_prog_data(pipeline);
@@ -1077,6 +1078,10 @@ emit_3dstate_clip(struct anv_pipeline *pipeline,
clip.APIMode = APIMODE_D3D,
clip.ViewportXYClipTestEnable = true;
+#if GEN_GEN >= 8
+ clip.VertexSubPixelPrecisionSelect = _8Bit;
+#endif
+
clip.ClipMode = CLIPMODE_NORMAL;
clip.TriangleStripListProvokingVertexSelect = 0;
@@ -1211,13 +1216,30 @@ emit_3dstate_streamout(struct anv_pipeline *pipeline,
hole_dwords -= 4;
}
+ int varying = output->location;
+ uint8_t component_mask = output->component_mask;
+ /* VARYING_SLOT_PSIZ contains three scalar fields packed together:
+ * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y
+ * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
+ * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w
+ */
+ if (varying == VARYING_SLOT_LAYER) {
+ varying = VARYING_SLOT_PSIZ;
+ component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
+ } else if (varying == VARYING_SLOT_VIEWPORT) {
+ varying = VARYING_SLOT_PSIZ;
+ component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
+ } else if (varying == VARYING_SLOT_PSIZ) {
+ component_mask = 1 << 3; // SO_DECL_COMPMASK_W
+ }
+
next_offset[buffer] = output->offset +
- __builtin_popcount(output->component_mask) * 4;
+ __builtin_popcount(component_mask) * 4;
so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
.OutputBufferSlot = buffer,
- .RegisterIndex = vue_map->varying_to_slot[output->location],
- .ComponentMask = output->component_mask,
+ .RegisterIndex = vue_map->varying_to_slot[varying],
+ .ComponentMask = component_mask,
};
}
@@ -2065,9 +2087,29 @@ compute_pipeline_create(
vfe.URBEntryAllocationSize = GEN_GEN <= 7 ? 0 : 2;
vfe.CURBEAllocationSize = vfe_curbe_allocation;
- vfe.PerThreadScratchSpace = get_scratch_space(cs_bin);
- vfe.ScratchSpaceBasePointer =
- get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin);
+ if (cs_bin->prog_data->total_scratch) {
+ if (GEN_GEN >= 8) {
+ /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+ * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+ */
+ vfe.PerThreadScratchSpace =
+ ffs(cs_bin->prog_data->total_scratch) - 11;
+ } else if (GEN_IS_HASWELL) {
+ /* Haswell's Per Thread Scratch Space is in the range [0, 10]
+ * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
+ */
+ vfe.PerThreadScratchSpace =
+ ffs(cs_bin->prog_data->total_scratch) - 12;
+ } else {
+ /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
+ * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
+ */
+ vfe.PerThreadScratchSpace =
+ cs_bin->prog_data->total_scratch / 1024 - 1;
+ }
+ vfe.ScratchSpaceBasePointer =
+ get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin);
+ }
}
struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
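
To sanity-check the three encodings above, a worked example (a sketch; total_scratch is assumed to be a power of two on the gens that use the doubling encoding):

   /* total_scratch = 8192 bytes:
    *   Gen8+:   ffs(8192) - 11 = 14 - 11 = 3 -> 8k  (0 = 1k, doubling)
    *   Haswell: ffs(8192) - 12 = 14 - 12 = 2 -> 8k  (0 = 2k, doubling)
    *   IVB/BYT: 8192 / 1024 - 1 = 7          -> 8kB (linear 1kB steps)
    */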
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 794d92dc6c9..6c1c76aeef0 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -356,14 +356,23 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
}
static void
-emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr)
+emit_query_mi_availability(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool available)
+{
+ genX(cmd_buffer_mi_memset)(cmd_buffer, addr, available, 8);
+}
+
+static void
+emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool available)
{
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.DestinationAddressType = DAT_PPGTT;
pc.PostSyncOperation = WriteImmediateData;
pc.Address = addr;
- pc.ImmediateData = 1;
+ pc.ImmediateData = available;
}
}
@@ -376,12 +385,40 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
struct anv_query_pool *pool,
uint32_t first_index, uint32_t num_queries)
{
- for (uint32_t i = 0; i < num_queries; i++) {
- struct anv_address slot_addr =
- anv_query_address(pool, first_index + i);
- genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8),
- 0, pool->stride - 8);
- emit_query_availability(cmd_buffer, slot_addr);
+ switch (pool->type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_TIMESTAMP:
+ /* These queries are written with a PIPE_CONTROL, so clear them with a
+ * PIPE_CONTROL as well so that we don't have to synchronize between two
+ * types of operations.
+ */
+ assert((pool->stride % 8) == 0);
+ for (uint32_t i = 0; i < num_queries; i++) {
+ struct anv_address slot_addr =
+ anv_query_address(pool, first_index + i);
+
+ for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
+ emit_query_pc_availability(cmd_buffer,
+ anv_address_add(slot_addr, qword * 8),
+ false);
+ }
+ emit_query_pc_availability(cmd_buffer, slot_addr, true);
+ }
+ break;
+
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ for (uint32_t i = 0; i < num_queries; i++) {
+ struct anv_address slot_addr =
+ anv_query_address(pool, first_index + i);
+ genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8),
+ 0, pool->stride - 8);
+ emit_query_mi_availability(cmd_buffer, slot_addr, true);
+ }
+ break;
+
+ default:
+ unreachable("Unsupported query type");
}
}
@@ -394,11 +431,28 @@ void genX(CmdResetQueryPool)(
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- for (uint32_t i = 0; i < queryCount; i++) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
- sdm.Address = anv_query_address(pool, firstQuery + i);
- sdm.ImmediateData = 0;
+ switch (pool->type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_TIMESTAMP:
+ for (uint32_t i = 0; i < queryCount; i++) {
+ emit_query_pc_availability(cmd_buffer,
+ anv_query_address(pool, firstQuery + i),
+ false);
+ }
+ break;
+
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+ for (uint32_t i = 0; i < queryCount; i++) {
+ emit_query_mi_availability(cmd_buffer,
+ anv_query_address(pool, firstQuery + i),
+ false);
}
+ break;
+ }
+
+ default:
+ unreachable("Unsupported query type");
}
}
@@ -511,9 +565,9 @@ void genX(CmdBeginQueryIndexedEXT)(
void genX(CmdEndQuery)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
- VkQueryControlFlags flags)
+ uint32_t query)
{
- genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, flags, 0);
+ genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}
void genX(CmdEndQueryIndexedEXT)(
@@ -529,7 +583,7 @@ void genX(CmdEndQueryIndexedEXT)(
switch (pool->type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
- emit_query_availability(cmd_buffer, query_addr);
+ emit_query_pc_availability(cmd_buffer, query_addr, true);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
@@ -548,7 +602,7 @@ void genX(CmdEndQueryIndexedEXT)(
offset += 16;
}
- emit_query_availability(cmd_buffer, query_addr);
+ emit_query_mi_availability(cmd_buffer, query_addr, true);
break;
}
@@ -559,7 +613,7 @@ void genX(CmdEndQueryIndexedEXT)(
}
emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 16));
- emit_query_availability(cmd_buffer, query_addr);
+ emit_query_mi_availability(cmd_buffer, query_addr, true);
break;
default:
@@ -614,7 +668,7 @@ void genX(CmdWriteTimestamp)(
break;
}
- emit_query_availability(cmd_buffer, query_addr);
+ emit_query_pc_availability(cmd_buffer, query_addr, true);
/* When multiview is active the spec requires that N consecutive query
* indices are used, where N is the number of active views in the subpass.
@@ -817,7 +871,20 @@ void genX(CmdCopyQueryPoolResults)(
}
if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
- (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) {
+ (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL. Because
+ * we're about to copy values written by MI commands, we need to stall the
+ * command streamer to make sure the PIPE_CONTROL values have landed;
+ * otherwise we could see inconsistent values & availability.
+ *
+ * From the Vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without
+ * any additional synchronization."
+ */
+ pool->type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->type == VK_QUERY_TYPE_TIMESTAMP) {
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index 05fdeca8c25..9adf6d95877 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2017-2018 Intel Corporation
+# Copyright © 2017-2019 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -98,14 +98,15 @@ foreach g : [['70', ['gen7_cmd_buffer.c']], ['75', ['gen7_cmd_buffer.c']],
'anv_gen@0@'.format(_gen),
[anv_gen_files, g[1], anv_entrypoints[0], anv_extensions_h],
include_directories : [
- inc_common, inc_compiler, inc_drm_uapi, inc_intel, inc_vulkan_util,
- inc_vulkan_wsi,
+ inc_common, inc_compiler, inc_drm_uapi, inc_intel, inc_vulkan_wsi,
],
c_args : [
c_vis_args, no_override_init_args, c_sse2_args,
'-DGEN_VERSIONx10=@0@'.format(_gen),
],
- dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
+ dependencies : [
+ dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml, idep_vulkan_util_headers,
+ ],
)
endforeach
@@ -144,6 +145,7 @@ anv_deps = [
dep_libdrm,
dep_valgrind,
idep_nir_headers,
+ idep_vulkan_util_headers,
]
anv_flags = [
c_vis_args,
@@ -178,9 +180,12 @@ endif
libanv_common = static_library(
'anv_common',
- [libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h],
+ [
+ libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h,
+ gen_xml_pack,
+ ],
include_directories : [
- inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util,
+ inc_common, inc_intel, inc_compiler, inc_drm_uapi,
inc_vulkan_wsi,
],
c_args : anv_flags,
@@ -191,16 +196,15 @@ libvulkan_intel = shared_library(
'vulkan_intel',
[files('anv_gem.c'), anv_entrypoints[0], anv_extensions_h],
include_directories : [
- inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util,
- inc_vulkan_wsi,
+ inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_wsi,
],
link_whole : [libanv_common, libanv_gen_libs],
link_with : [
libintel_compiler, libintel_common, libintel_dev, libisl, libblorp,
- libvulkan_util, libvulkan_wsi, libmesa_util,
+ libvulkan_wsi, libmesa_util,
],
dependencies : [
- dep_thread, dep_dl, dep_m, anv_deps, idep_nir,
+ dep_thread, dep_dl, dep_m, anv_deps, idep_nir, idep_genxml, idep_vulkan_util
],
c_args : anv_flags,
link_args : ['-Wl,--build-id=sha1', ld_args_bsymbolic, ld_args_gc_sections],
@@ -212,16 +216,15 @@ if with_tests
'vulkan_intel_test',
[files('anv_gem_stubs.c'), anv_entrypoints[0], anv_extensions_h],
include_directories : [
- inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util,
- inc_vulkan_wsi,
+ inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_wsi,
],
link_whole : libanv_common,
link_with : [
libanv_gen_libs, libintel_compiler, libintel_common, libintel_dev,
- libisl, libblorp, libvulkan_util, libvulkan_wsi, libmesa_util,
+ libisl, libblorp, libvulkan_wsi, libmesa_util,
],
dependencies : [
- dep_thread, dep_dl, dep_m, anv_deps, idep_nir,
+ dep_thread, dep_dl, dep_m, anv_deps, idep_nir, idep_vulkan_util
],
c_args : anv_flags,
)
@@ -236,9 +239,9 @@ if with_tests
['tests/@0@.c'.format(t), anv_entrypoints[0], anv_extensions_h],
c_args : [ c_sse2_args ],
link_with : libvulkan_intel_test,
- dependencies : [dep_libdrm, dep_thread, dep_m, dep_valgrind],
dependencies : [dep_libdrm, dep_thread, dep_m, dep_valgrind, idep_vulkan_util],
include_directories : [
- inc_common, inc_intel, inc_compiler, inc_vulkan_util, inc_vulkan_wsi,
+ inc_common, inc_intel, inc_compiler, inc_vulkan_wsi,
],
),
suite : ['intel'],
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index ad9b9d87b05..7d61c1df4fc 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -111,7 +111,7 @@ set_adaptive_sync_property(xcb_connection_t *conn, xcb_drawable_t drawable,
xcb_intern_atom_reply_t* reply;
xcb_void_cookie_t check;
- cookie = xcb_intern_atom(conn, 0, sizeof(name), name);
+ cookie = xcb_intern_atom(conn, 0, strlen(name), name);
reply = xcb_intern_atom_reply(conn, cookie, NULL);
if (reply == NULL)
return;
diff --git a/src/mapi/es1api/meson.build b/src/mapi/es1api/meson.build
index b0416e705a1..14ca49c1407 100644
--- a/src/mapi/es1api/meson.build
+++ b/src/mapi/es1api/meson.build
@@ -38,7 +38,7 @@ libglesv1_cm = shared_library(
include_directories : [inc_src, inc_include, inc_mapi],
link_with : libglapi,
dependencies : [dep_thread, dep_libdrm, dep_m, dep_dl],
- version : '1.0.0',
+ version : '1.1.0',
install : true,
)
diff --git a/src/mesa/drivers/dri/Android.mk b/src/mesa/drivers/dri/Android.mk
index 53ff4b4f632..60c8476a38a 100644
--- a/src/mesa/drivers/dri/Android.mk
+++ b/src/mesa/drivers/dri/Android.mk
@@ -49,11 +49,19 @@ MESA_DRI_WHOLE_STATIC_LIBRARIES := \
MESA_DRI_SHARED_LIBRARIES := \
libcutils \
libdl \
- libexpat \
libglapi \
liblog \
libz
+# If the Android version is >= 8, MESA should link libexpat statically;
+# otherwise it should link it dynamically.
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+MESA_DRI_WHOLE_STATIC_LIBRARIES += \
+ libexpat
+else
+MESA_DRI_SHARED_LIBRARIES += \
+ libexpat
+endif
+
#-----------------------------------------------
# Build drivers and libmesa_dri_common
diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk
index 1574c8834c9..97def8f03fe 100644
--- a/src/mesa/drivers/dri/i965/Android.mk
+++ b/src/mesa/drivers/dri/i965/Android.mk
@@ -274,6 +274,8 @@ LOCAL_LDFLAGS += $(MESA_DRI_LDFLAGS)
LOCAL_CFLAGS := \
$(MESA_DRI_CFLAGS)
+LOCAL_CFLAGS += -Wno-error
+
LOCAL_C_INCLUDES := \
$(MESA_DRI_C_INCLUDES) \
$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_glsl,,) \
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index b562c6ea21c..0bda2897e8e 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -34,6 +34,8 @@ AM_CFLAGS = \
-I$(top_builddir)/src/util \
-I$(top_srcdir)/src/mesa/drivers/dri/common \
-I$(top_srcdir)/src/gtest/include \
+ -I$(top_builddir)/src/compiler \
+ -I$(top_srcdir)/src/compiler \
-I$(top_builddir)/src/compiler/glsl \
-I$(top_builddir)/src/compiler/nir \
-I$(top_srcdir)/src/compiler/nir \
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index f1675b191c1..43077e60da4 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -402,6 +402,8 @@ vma_alloc(struct brw_bufmgr *bufmgr,
/* Without softpin support, we let the kernel assign addresses. */
assert(brw_using_softpin(bufmgr));
+ alignment = ALIGN(alignment, PAGE_SIZE);
+
struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
uint64_t addr;
@@ -1487,7 +1489,7 @@ brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd)
brw_bo_make_external(bo);
if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle,
- DRM_CLOEXEC, prime_fd) != 0)
+ DRM_CLOEXEC | DRM_RDWR, prime_fd) != 0)
return -errno;
bo->reusable = false;
@@ -1717,6 +1719,9 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)
const uint64_t _4GB = 4ull << 30;
+ /* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */
+ const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE;
+
if (devinfo->gen >= 8 && gtt_size > _4GB) {
bufmgr->initial_kflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
@@ -1726,9 +1731,13 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)
bufmgr->initial_kflags |= EXEC_OBJECT_PINNED;
util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_LOW_4G],
- PAGE_SIZE, _4GB);
+ PAGE_SIZE, _4GB_minus_1);
+
+ /* Leave the last 4GB out of the high vma range, so that no state
+ * base address + size can overflow 48 bits.
+ */
util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_OTHER],
- 1 * _4GB, gtt_size - 1 * _4GB);
+ 1 * _4GB, gtt_size - 2 * _4GB);
} else if (devinfo->gen >= 10) {
/* Softpin landed in 4.5, but GVT used an aliasing PPGTT until
* kernel commit 6b3816d69628becb7ff35978aa0751798b4a940a in
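
A quick arithmetic check of the heap sizing above (a sketch, assuming a full 48-bit GTT):

   /* gtt_size = 1ull << 48:
    *   high heap spans [4GB, 2^48 - 4GB), so for any base in the heap,
    *   base + 4GB <= 2^48 and base + size can never wrap 48 bits;
    *   the low heap's size of 4GB - PAGE_SIZE similarly keeps every
    *   programmed STATE_BASE_ADDRESS size representable in its field.
    */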
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 505da9896b3..e4bc5fe99f0 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -893,6 +893,19 @@ brw_process_driconf_options(struct brw_context *brw)
ctx->Const.dri_config_options_sha1 = ralloc_array(brw, unsigned char, 20);
driComputeOptionsSha1(&brw->screen->optionCache,
ctx->Const.dri_config_options_sha1);
+
+ brw->screen->compiler->simd32_heuristics_control.grouped_sends_check =
+ driQueryOptionb(&brw->optionCache, "simd32_heuristic_grouped_check");
+ brw->screen->compiler->simd32_heuristics_control.max_grouped_sends =
+ driQueryOptioni(&brw->optionCache, "simd32_heuristic_grouped_sends");
+ brw->screen->compiler->simd32_heuristics_control.inst_count_check =
+ driQueryOptionb(&brw->optionCache, "simd32_heuristic_inst_check");
+ brw->screen->compiler->simd32_heuristics_control.inst_count_ratio =
+ driQueryOptionf(&brw->optionCache, "simd32_heuristic_inst_ratio");
+ brw->screen->compiler->simd32_heuristics_control.mrt_check =
+ driQueryOptionb(&brw->optionCache, "simd32_heuristic_mrt_check");
+ brw->screen->compiler->simd32_heuristics_control.max_mrts =
+ driQueryOptioni(&brw->optionCache, "simd32_heuristic_max_mrts");
}
GLboolean
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 66fe5b3a8a0..7237f39d286 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -686,6 +686,7 @@ enum brw_query_kind {
OA_COUNTERS,
OA_COUNTERS_RAW,
PIPELINE_STATS,
+ NULL_RENDERER,
};
struct brw_perf_query_register_prog {
@@ -1006,6 +1007,9 @@ struct brw_context
/* High bits of the last seen index buffer address (for workarounds). */
uint16_t last_bo_high_bits;
+
+ /* Used to track whether the GPU's primitive restart state is up to date */
+ bool enable_cut_index;
} ib;
/* Active vertex program:
@@ -1246,6 +1250,7 @@ struct brw_context
int n_active_oa_queries;
int n_active_pipeline_stats_queries;
+ int n_active_null_renderers;
/* The number of queries depending on running OA counters which
* extends beyond brw_end_perf_query() since we need to wait until
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 2729a54e144..cdfa435a1f5 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1652,11 +1652,17 @@ enum brw_pixel_shader_coverage_mask_mode {
#define GEN10_CACHE_MODE_SS 0x0e420
#define GEN10_FLOAT_BLEND_OPTIMIZATION_ENABLE (1 << 4)
-#define INSTPM 0x20c0
+#define INSTPM 0x20c0 /* Gen6-8 */
# define INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 6)
+# define INSTPM_GLOBAL_DEBUG_ENABLE (1 << 4)
+# define INSTPM_MEDIA_INSTRUCTION_DISABLE (1 << 3)
+# define INSTPM_3D_RENDERER_INSTRUCTION_DISABLE (1 << 2)
+# define INSTPM_3D_STATE_INSTRUCTION_DISABLE (1 << 1)
#define CS_DEBUG_MODE2 0x20d8 /* Gen9+ */
# define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
+# define CSDBG2_MEDIA_INSTRUCTION_DISABLE (1 << 1)
+# define CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE (1 << 0)
#define GEN7_RPSTAT1 0xA01C
#define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index dfbc45fe938..2f52899fcb0 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -776,6 +776,14 @@ brw_upload_indices(struct brw_context *brw)
brw->ib.index_size = index_buffer->index_size;
brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER;
}
+
+ /* We need to re-emit index buffer state whenever the cut index flag
+ * changes.
+ */
+ if (brw->prim_restart.enable_cut_index != brw->ib.enable_cut_index) {
+ brw->ib.enable_cut_index = brw->prim_restart.enable_cut_index;
+ brw->ctx.NewDriverState |= BRW_NEW_INDEX_BUFFER;
+ }
}
const struct brw_tracked_state brw_indices = {
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 2cbb1e0b879..95d87dc56fd 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -323,7 +323,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
brw_shader_gather_info(prog->nir, prog);
- NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shProg);
NIR_PASS_V(prog->nir, gl_nir_lower_atomics, shProg, false);
NIR_PASS_V(prog->nir, nir_lower_atomics_to_ssbo,
prog->nir->info.num_abos);
diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
index 256fdd8fc79..7e2a5b045dd 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
+++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
@@ -80,15 +80,15 @@ setup_vec4_image_param(uint32_t *params, uint32_t idx,
}
static void
-brw_setup_image_uniform_values(gl_shader_stage stage,
- struct brw_stage_prog_data *stage_prog_data,
- unsigned param_start_index,
- const gl_uniform_storage *storage)
+brw_setup_image_uniform_values(nir_variable *var,
+ struct brw_stage_prog_data *prog_data)
{
- uint32_t *param = &stage_prog_data->param[param_start_index];
+ unsigned param_start_index = var->data.driver_location / 4;
+ uint32_t *param = &prog_data->param[param_start_index];
+ unsigned num_images = MAX2(1, var->type->arrays_of_arrays_size());
- for (unsigned i = 0; i < MAX2(storage->array_elements, 1); i++) {
- const unsigned image_idx = storage->opaque[stage].index + i;
+ for (unsigned i = 0; i < num_images; i++) {
+ const unsigned image_idx = var->data.binding + i;
/* Upload the brw_image_param structure. The order is expected to match
* the BRW_IMAGE_PARAM_*_OFFSET defines.
@@ -150,6 +150,14 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var,
struct brw_stage_prog_data *stage_prog_data,
bool is_scalar)
{
+ if (var->type->without_array()->is_sampler())
+ return;
+
+ if (var->type->without_array()->is_image()) {
+ brw_setup_image_uniform_values(var, stage_prog_data);
+ return;
+ }
+
/* The data for our (non-builtin) uniforms is stored in a series of
* gl_uniform_storage structs for each subcomponent that
* glGetUniformLocation() could name. We know it's been set up in the same
@@ -162,15 +170,17 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var,
struct gl_uniform_storage *storage =
&prog->sh.data->UniformStorage[var->data.location + u];
- if (storage->builtin || storage->type->is_sampler())
+ /* We already handled samplers and images via the separate top-level
+ * variables created by gl_nir_lower_samplers_as_deref(), but they're
+ * still part of the structure's storage, and so we'll see them while
+ * walking it to set up the other regular fields. Just skip over them.
+ */
+ if (storage->builtin ||
+ storage->type->is_sampler() ||
+ storage->type->is_image())
continue;
- if (storage->type->is_image()) {
- brw_setup_image_uniform_values(stage, stage_prog_data,
- uniform_index, storage);
- uniform_index +=
- BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1);
- } else {
+ {
gl_constant_value *components = storage->storage;
unsigned vector_count = (MAX2(storage->array_elements, 1) *
storage->type->matrix_columns);
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 10e3d024f17..85d14a83c7e 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -330,6 +330,12 @@ dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
obj->pipeline_stats.bo ? "yes" : "no");
break;
+ case NULL_RENDERER:
+ DBG("%4d: %-6s %-8s NULL_RENDERER\n",
+ id,
+ o->Used ? "Dirty," : "New,",
+ o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"));
+ break;
default:
unreachable("Unknown query type");
break;
@@ -431,6 +437,10 @@ brw_get_perf_query_info(struct gl_context *ctx,
*n_active = brw->perfquery.n_active_pipeline_stats_queries;
break;
+ case NULL_RENDERER:
+ *n_active = brw->perfquery.n_active_null_renderers;
+ break;
+
default:
unreachable("Unknown query type");
break;
@@ -1020,6 +1030,7 @@ brw_begin_perf_query(struct gl_context *ctx,
struct brw_context *brw = brw_context(ctx);
struct brw_perf_query_object *obj = brw_perf_query(o);
const struct brw_perf_query_info *query = obj->query;
+ const struct gen_device_info *devinfo = &brw->screen->devinfo;
/* We can assume the frontend hides mistaken attempts to Begin a
* query object multiple times before its End. Similarly if an
@@ -1104,7 +1115,6 @@ brw_begin_perf_query(struct gl_context *ctx,
/* If the OA counters aren't already on, enable them. */
if (brw->perfquery.oa_stream_fd == -1) {
__DRIscreen *screen = brw->screen->driScrnPriv;
- const struct gen_device_info *devinfo = &brw->screen->devinfo;
/* The period_exponent gives a sampling period as follows:
* sample_period = timestamp_period * 2^(period_exponent + 1)
@@ -1250,6 +1260,23 @@ brw_begin_perf_query(struct gl_context *ctx,
++brw->perfquery.n_active_pipeline_stats_queries;
break;
+ case NULL_RENDERER:
+ ++brw->perfquery.n_active_null_renderers;
+ if (devinfo->gen >= 9) {
+ brw_load_register_imm32(brw, CS_DEBUG_MODE2,
+ REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE) |
+ CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE);
+ } else {
+ brw_load_register_imm32(brw, INSTPM,
+ REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE |
+ INSTPM_MEDIA_INSTRUCTION_DISABLE) |
+ INSTPM_3D_RENDERER_INSTRUCTION_DISABLE |
+ INSTPM_MEDIA_INSTRUCTION_DISABLE);
+ }
+ brw_emit_pipe_control_flush(brw,
+ PIPE_CONTROL_LRI_WRITE_IMMEDIATE);
+ break;
+
default:
unreachable("Unknown query type");
break;
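
The LRI writes above rely on the masked-register convention of INSTPM and CS_DEBUG_MODE2, where the top 16 bits of the written value act as per-bit write enables for the low 16 bits. A minimal sketch, assuming i965's REG_MASK(v) expands to ((v) << 16):

   /* Set a masked bit: enable its write (high half) and the bit itself. */
   uint32_t set_val   = REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE) |
                        INSTPM_3D_RENDERER_INSTRUCTION_DISABLE;
   /* Clear it: enable the write but leave the low bit at zero. */
   uint32_t clear_val = REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE);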
@@ -1270,6 +1297,7 @@ brw_end_perf_query(struct gl_context *ctx,
{
struct brw_context *brw = brw_context(ctx);
struct brw_perf_query_object *obj = brw_perf_query(o);
+ const struct gen_device_info *devinfo = &brw->screen->devinfo;
DBG("End(%d)\n", o->Id);
@@ -1312,6 +1340,21 @@ brw_end_perf_query(struct gl_context *ctx,
--brw->perfquery.n_active_pipeline_stats_queries;
break;
+ case NULL_RENDERER:
+ if (--brw->perfquery.n_active_null_renderers == 0) {
+ if (devinfo->gen >= 9) {
+ brw_load_register_imm32(brw, CS_DEBUG_MODE2,
+ REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE));
+ } else {
+ brw_load_register_imm32(brw, INSTPM,
+ REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE |
+ INSTPM_MEDIA_INSTRUCTION_DISABLE));
+ }
+ brw_emit_pipe_control_flush(brw,
+ PIPE_CONTROL_LRI_WRITE_IMMEDIATE);
+ }
+ break;
+
default:
unreachable("Unknown query type");
break;
@@ -1337,6 +1380,9 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
bo = obj->pipeline_stats.bo;
break;
+ case NULL_RENDERER:
+ break;
+
default:
unreachable("Unknown query type");
break;
@@ -1387,6 +1433,8 @@ brw_is_perf_query_ready(struct gl_context *ctx,
return (obj->pipeline_stats.bo &&
!brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
!brw_bo_busy(obj->pipeline_stats.bo));
+ case NULL_RENDERER:
+ return true;
default:
unreachable("Unknown query type");
@@ -1602,6 +1650,9 @@ brw_get_perf_query_data(struct gl_context *ctx,
written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
break;
+ case NULL_RENDERER:
+ break;
+
default:
unreachable("Unknown query type");
break;
@@ -1672,6 +1723,9 @@ brw_delete_perf_query(struct gl_context *ctx,
}
break;
+ case NULL_RENDERER:
+ break;
+
default:
unreachable("Unknown query type");
break;
@@ -2152,6 +2206,15 @@ get_register_queries_function(const struct gen_device_info *devinfo)
return NULL;
}
+static void
+fill_null_renderer_perf_query_info(struct brw_context *brw,
+ struct brw_perf_query_info *query)
+{
+ query->kind = NULL_RENDERER;
+ query->name = "Intel_Null_Hardware_Query";
+ query->n_counters = 0;
+}
+
static unsigned
brw_init_perf_query_info(struct gl_context *ctx)
{
@@ -2210,6 +2273,10 @@ brw_init_perf_query_info(struct gl_context *ctx)
enumerate_sysfs_metrics(brw);
brw_perf_query_register_mdapi_oa_query(brw);
+
+ struct brw_perf_query_info *null_query =
+ brw_perf_query_append_query_info(brw);
+ fill_null_renderer_perf_query_info(brw, null_query);
}
brw->perfquery.unaccumulated =
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 9ab25cf664c..841b7df896d 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -42,7 +42,8 @@
#include "compiler/glsl/ir.h"
#include "compiler/glsl/program.h"
#include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/glsl/float64_glsl.h"
+#include "compiler/glsl/gl_nir.h"
+#include "glsl/float64_glsl.h"
#include "brw_program.h"
#include "brw_context.h"
@@ -165,6 +166,9 @@ brw_create_nir(struct brw_context *brw,
nir = brw_preprocess_nir(brw->screen->compiler, nir);
+ if (shader_prog)
+ NIR_PASS_V(nir, gl_nir_lower_samplers, shader_prog);
+
NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
if (stage == MESA_SHADER_TESS_CTRL) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 7bbb6166344..9f88d625d63 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -309,6 +309,7 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
const int s = u_bit_scan(&mask);
key->swizzles[s] = SWIZZLE_NOOP;
+ key->scale_factors[s] = 0.0f;
int unit_id = prog->SamplerUnits[s];
const struct gl_texture_unit *unit = &ctx->Texture.Unit[unit_id];
@@ -406,6 +407,10 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
}
if (t->Target == GL_TEXTURE_EXTERNAL_OES && intel_tex->planar_format) {
+
+ /* Set up the possible scaling factor. */
+ key->scale_factors[s] = intel_tex->planar_format->scaling_factor;
+
switch (intel_tex->planar_format->components) {
case __DRI_IMAGE_COMPONENTS_Y_UV:
key->y_uv_image_mask |= 1 << s;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index b067a174056..8269056c74c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1681,6 +1681,11 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
ISL_FORMAT_RAW,
3 * sizeof(GLuint), 1,
RELOC_WRITE);
+
+ /* The state buffer now holds a reference to our upload, so drop ours. */
+ if (bo != brw->compute.num_work_groups_bo)
+ brw_bo_unreference(bo);
+
brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
}
}
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index dcdfb3c9292..73c983ce742 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -998,7 +998,8 @@ genX(emit_index_buffer)(struct brw_context *brw)
brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GEN_GEN < 8 && !GEN_IS_HASWELL
- ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
+ assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index);
+ ib.CutIndexEnable = brw->ib.enable_cut_index;
#endif
ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
@@ -2445,7 +2446,7 @@ set_scissor_bits(const struct gl_context *ctx, int i,
bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
- bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+ bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height);
bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
_mesa_intersect_scissor_bounding_box(ctx, i, bbox);
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 7513d15c3dd..92ecd612006 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -182,14 +182,16 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.ARB_conditional_render_inverted = true;
ctx->Extensions.ARB_cull_distance = true;
ctx->Extensions.ARB_draw_buffers_blend = true;
- if (ctx->API != API_OPENGL_COMPAT)
+ if (ctx->API != API_OPENGL_COMPAT ||
+ ctx->Const.AllowHigherCompatVersion)
ctx->Extensions.ARB_enhanced_layouts = true;
ctx->Extensions.ARB_ES3_compatibility = true;
ctx->Extensions.ARB_fragment_layer_viewport = true;
ctx->Extensions.ARB_pipeline_statistics_query = true;
ctx->Extensions.ARB_sample_shading = true;
ctx->Extensions.ARB_shading_language_420pack = true;
- if (ctx->API != API_OPENGL_COMPAT) {
+ if (ctx->API != API_OPENGL_COMPAT ||
+ ctx->Const.AllowHigherCompatVersion) {
ctx->Extensions.ARB_texture_buffer_object = true;
ctx->Extensions.ARB_texture_buffer_object_rgb32 = true;
ctx->Extensions.ARB_texture_buffer_range = true;
@@ -199,7 +201,8 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.ARB_texture_multisample = true;
ctx->Extensions.ARB_uniform_buffer_object = true;
- if (ctx->API != API_OPENGL_COMPAT)
+ if (ctx->API != API_OPENGL_COMPAT ||
+ ctx->Const.AllowHigherCompatVersion)
ctx->Extensions.AMD_vertex_shader_layer = true;
ctx->Extensions.EXT_framebuffer_multisample = true;
ctx->Extensions.EXT_framebuffer_multisample_blit_scaled = true;
@@ -228,7 +231,8 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.ARB_conservative_depth = true;
ctx->Extensions.ARB_derivative_control = true;
ctx->Extensions.ARB_framebuffer_no_attachments = true;
- if (ctx->API != API_OPENGL_COMPAT) {
+ if (ctx->API != API_OPENGL_COMPAT ||
+ ctx->Const.AllowHigherCompatVersion) {
ctx->Extensions.ARB_gpu_shader5 = true;
ctx->Extensions.ARB_gpu_shader_fp64 = true;
}
@@ -239,7 +243,8 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.ARB_shader_image_size = true;
ctx->Extensions.ARB_shader_precision = true;
ctx->Extensions.ARB_shader_texture_image_samples = true;
- if (ctx->API != API_OPENGL_COMPAT)
+ if (ctx->API != API_OPENGL_COMPAT ||
+ ctx->Const.AllowHigherCompatVersion)
ctx->Extensions.ARB_tessellation_shader = true;
ctx->Extensions.ARB_texture_compression_bptc = true;
ctx->Extensions.ARB_texture_view = true;
@@ -248,7 +253,6 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.EXT_shader_samples_identical = true;
ctx->Extensions.OES_primitive_bounding_box = true;
ctx->Extensions.OES_texture_buffer = true;
- ctx->Extensions.ARB_fragment_shader_interlock = true;
if (can_do_pipelined_register_writes(brw->screen)) {
ctx->Extensions.ARB_draw_indirect = true;
@@ -313,6 +317,30 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.KHR_blend_equation_advanced_coherent = true;
ctx->Extensions.KHR_texture_compression_astc_ldr = true;
ctx->Extensions.KHR_texture_compression_astc_sliced_3d = true;
+
+ /*
+ * From the Skylake PRM Vol. 7 (Memory Fence Message, page 221):
+ * "A memory fence message issued by a thread causes further messages
+ * issued by the thread to be blocked until all previous data port
+ * messages have completed, or the results can be globally observed from
+ * the point of view of other threads in the system."
+ *
+ * From the Haswell PRM Vol. 7 (Memory Fence, page 256):
+ * "A memory fence message issued by a thread causes further messages
+ * issued by the thread to be blocked until all previous messages issued
+ * by the thread to that data port (data cache or render cache) have
+ * been globally observed from the point of view of other threads in the
+ * system."
+ *
+ * Summarized: For ARB_fragment_shader_interlock to work, we need to
+ * ensure memory access ordering for all messages to the dataport from
+ * all threads. Memory fence messages prior to SKL only provide memory
+ * access ordering for messages from the same thread, so we can only
+ * support the feature from Gen9 onwards.
+ */
+
+ ctx->Extensions.ARB_fragment_shader_interlock = true;
}
if (gen_device_info_is_9lp(devinfo))
@@ -321,7 +349,8 @@ intelInitExtensions(struct gl_context *ctx)
if (devinfo->gen >= 6)
ctx->Extensions.INTEL_performance_query = true;
- if (ctx->API != API_OPENGL_COMPAT)
+ if (ctx->API != API_OPENGL_COMPAT ||
+ ctx->Const.AllowHigherCompatVersion)
ctx->Extensions.ARB_base_instance = true;
if (ctx->API != API_OPENGL_CORE)
ctx->Extensions.ARB_color_buffer_float = true;
diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h
index ca604159dc2..4ab8a49b8bb 100644
--- a/src/mesa/drivers/dri/i965/intel_image.h
+++ b/src/mesa/drivers/dri/i965/intel_image.h
@@ -62,6 +62,7 @@ struct intel_image_format {
uint32_t dri_format;
int cpp;
} planes[3];
+ float scaling_factor;
};
struct __DRIimageRec {
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 8838f977bb6..2436f48a065 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -61,6 +61,33 @@ DRI_CONF_BEGIN
DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
DRI_CONF_DESC_END
DRI_CONF_OPT_END
+
+ DRI_CONF_OPT_BEGIN_B(simd32_heuristic_grouped_check, "true")
+ DRI_CONF_DESC(en, "Enable/disable grouped texture fetch "
+ "check in the SIMD32 selection heuristic.")
+ DRI_CONF_OPT_END
+ DRI_CONF_OPT_BEGIN_V(simd32_heuristic_grouped_sends, int, 6, "1:999")
+ DRI_CONF_DESC(en, "How many grouped texture fetches should "
+ "the SIMD32 selection heuristic allow.")
+ DRI_CONF_OPT_END
+ DRI_CONF_OPT_BEGIN_B(simd32_heuristic_inst_check, "true")
+ DRI_CONF_DESC(en, "Enable/disable SIMD32/SIMD16 instruction "
+ "count ratio check in the SIMD32 selection "
+ "heuristic.")
+ DRI_CONF_OPT_END
+ DRI_CONF_OPT_BEGIN_V(simd32_heuristic_inst_ratio, float, 2.3, "1:999")
+ DRI_CONF_DESC(en, "SIMD32/SIMD16 instruction count ratio "
+ "the SIMD32 selection heuristic should allow.")
+ DRI_CONF_OPT_END
+ DRI_CONF_OPT_BEGIN_B(simd32_heuristic_mrt_check, "true")
+ DRI_CONF_DESC(en, "Enable/disable MRT write check in the "
+ "SIMD32 selection heuristic.")
+ DRI_CONF_OPT_END
+ DRI_CONF_OPT_BEGIN_V(simd32_heuristic_max_mrts, int, 1, "1:8")
+ DRI_CONF_DESC(en, "How many MRT writes should the SIMD32 "
+ "selection heuristic allow.")
+ DRI_CONF_OPT_END
+
DRI_CONF_MESA_NO_ERROR("false")
DRI_CONF_SECTION_END
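
Since these are driconf options, they can be overridden per application from a drirc file. A hedged sketch (the option names come from the definitions above; the surrounding markup follows the usual ~/.drirc layout):

   <driconf>
      <device driver="i965">
         <application name="Default" executable="">
            <option name="simd32_heuristic_inst_ratio" value="2.5" />
            <option name="simd32_heuristic_max_mrts" value="2" />
         </application>
      </device>
   </driconf>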
@@ -282,6 +309,18 @@ static const struct intel_image_format intel_image_formats[] = {
{ { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
{ 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } },
+ { __DRI_IMAGE_FOURCC_P010, __DRI_IMAGE_COMPONENTS_Y_UV, 2,
+ { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 },
+ { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } },
+
+ { __DRI_IMAGE_FOURCC_P012, __DRI_IMAGE_COMPONENTS_Y_UV, 2,
+ { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 },
+ { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } },
+
+ { __DRI_IMAGE_FOURCC_P016, __DRI_IMAGE_COMPONENTS_Y_UV, 2,
+ { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 },
+ { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } },
+
{ __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2,
{ { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
{ 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } },
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
index cd3683ae7ec..0bc6125f19c 100644
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -187,7 +187,7 @@ libi965 = static_library(
i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
libblorp
],
- dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
+ dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml],
)
dri_drivers += libi965
diff --git a/src/mesa/drivers/dri/meson.build b/src/mesa/drivers/dri/meson.build
index d98c823f5fe..dddc4ae3dfd 100644
--- a/src/mesa/drivers/dri/meson.build
+++ b/src/mesa/drivers/dri/meson.build
@@ -54,6 +54,10 @@ if dri_drivers != []
dep_selinux, dep_libdrm, dep_expat, dep_m, dep_thread, dep_dl, idep_nir,
],
link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections],
+ # Will be deleted during installation; see install_megadrivers.py
+ install : true,
+ install_dir : dri_drivers_path,
+ name_suffix : 'so',
)
meson.add_install_script(
@@ -78,7 +82,7 @@ if with_dri
filebase : 'dri',
description : 'Direct Rendering Infrastructure',
version : meson.project_version(),
- variables : ['dridriverdir=${prefix}/' + dri_drivers_path],
+ variables : ['dridriverdir=' + dri_drivers_path],
requires_private : dri_req_private,
)
endif
diff --git a/src/mesa/drivers/osmesa/meson.build b/src/mesa/drivers/osmesa/meson.build
index a406bb3c210..c479b740131 100644
--- a/src/mesa/drivers/osmesa/meson.build
+++ b/src/mesa/drivers/osmesa/meson.build
@@ -33,7 +33,8 @@ libosmesa = shared_library(
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux,
],
- link_with : [libmesa_classic, libglapi_static, osmesa_link_with],
+ link_whole : libglapi_static,
+ link_with : [libmesa_classic, osmesa_link_with],
dependencies : [dep_thread, dep_selinux],
version : '8.0.0',
install : true,
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 97461cede34..eb22fcbdb31 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -962,6 +962,8 @@ make_list(GLuint name, GLuint count)
dlist->Name = name;
dlist->Head = malloc(sizeof(Node) * count);
dlist->Head[0].opcode = OPCODE_END_OF_LIST;
+ /* All InstSize[] entries must be non-zero */
+ InstSize[OPCODE_END_OF_LIST] = 1;
return dlist;
}
@@ -2753,6 +2755,7 @@ save_Fogiv(GLenum pname, const GLint *params)
case GL_FOG_START:
case GL_FOG_END:
case GL_FOG_INDEX:
+ case GL_FOG_COORDINATE_SOURCE:
p[0] = (GLfloat) *params;
p[1] = 0.0f;
p[2] = 0.0f;
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index a9687913627..30560ba047e 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -231,6 +231,9 @@ _mesa_gl_vdebug(struct gl_context *ctx,
_mesa_debug_get_id(id);
len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args);
+ if (len >= MAX_DEBUG_MESSAGE_LENGTH)
+ /* message was truncated */
+ len = MAX_DEBUG_MESSAGE_LENGTH - 1;
_mesa_log_msg(ctx, source, type, *id, severity, len, s);
}
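
The clamp above leans on vsnprintf()'s return convention: it reports the length the formatted message would have had, so a return value >= the buffer size signals truncation. A small self-contained illustration (a sketch, independent of Mesa):

   #include <stdio.h>
   #include <string.h>

   int main(void)
   {
      char buf[8];
      int len = snprintf(buf, sizeof(buf), "%s", "hello, world");
      /* len == 12 even though buf only holds "hello, " plus the NUL. */
      if (len >= (int) sizeof(buf))
         len = sizeof(buf) - 1;   /* the same clamp the patch applies */
      return (len == 7 && strlen(buf) == 7) ? 0 : 1;
   }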
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 8290ea94dfc..341fd93efc6 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -4663,8 +4663,12 @@ get_fb_attachment(struct gl_context *ctx, struct gl_framebuffer *fb,
case GL_COLOR_ATTACHMENT12:
case GL_COLOR_ATTACHMENT13:
case GL_COLOR_ATTACHMENT14:
- case GL_COLOR_ATTACHMENT15:
- return &fb->Attachment[BUFFER_COLOR0 + attachment - GL_COLOR_ATTACHMENT0];
+ case GL_COLOR_ATTACHMENT15: {
+ const unsigned i = attachment - GL_COLOR_ATTACHMENT0;
+ if (i >= ctx->Const.MaxColorAttachments)
+ return NULL;
+ return &fb->Attachment[BUFFER_COLOR0 + i];
+ }
case GL_DEPTH:
case GL_DEPTH_ATTACHMENT:
case GL_DEPTH_STENCIL_ATTACHMENT:
@@ -4691,6 +4695,29 @@ discard_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
if (!att)
continue;
+ /* If we're asked to invalidate just depth or just stencil, but the
+ * attachment is packed depth/stencil, then we can only use
+ * Driver.DiscardFramebuffer if the attachments list includes both depth
+ * and stencil and they both point at the same renderbuffer.
+ */
+ if ((attachments[i] == GL_DEPTH_ATTACHMENT ||
+ attachments[i] == GL_STENCIL_ATTACHMENT) &&
+ (!att->Renderbuffer ||
+ att->Renderbuffer->_BaseFormat == GL_DEPTH_STENCIL)) {
+ GLenum other_format = (attachments[i] == GL_DEPTH_ATTACHMENT ?
+ GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT);
+ bool has_both = false;
+ for (int j = 0; j < numAttachments; j++) {
+    if (attachments[j] == other_format) {
+       has_both = true;
+       break;
+    }
+ }
+
+ if (fb->Attachment[BUFFER_DEPTH].Renderbuffer !=
+ fb->Attachment[BUFFER_STENCIL].Renderbuffer || !has_both)
+ continue;
+ }
+
ctx->Driver.DiscardFramebuffer(ctx, fb, att);
}
}
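
For context, the guard matters for GL calls like the following (a sketch): with a packed depth/stencil renderbuffer bound to both attachment points, invalidating just one aspect cannot safely be forwarded to Driver.DiscardFramebuffer.

   /* May NOT be discarded by the driver hook (stencil shares the buffer): */
   const GLenum depth_only[] = { GL_DEPTH_ATTACHMENT };
   glInvalidateFramebuffer(GL_FRAMEBUFFER, 1, depth_only);

   /* Listing both aspects of the shared buffer makes the discard safe: */
   const GLenum both[] = { GL_DEPTH_ATTACHMENT, GL_STENCIL_ATTACHMENT };
   glInvalidateFramebuffer(GL_FRAMEBUFFER, 2, both);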
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index ee77c45d03c..efc9c11f79d 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -906,6 +906,9 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
break;
/* GL_EXT_external_objects */
+ case GL_NUM_DEVICE_UUIDS_EXT:
+ v->value_int = 1;
+ break;
case GL_DRIVER_UUID_EXT:
_mesa_get_driver_uuid(ctx, v->value_int_4);
break;
diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk
index c6470e6289e..13d0da85882 100644
--- a/src/mesa/program/Android.mk
+++ b/src/mesa/program/Android.mk
@@ -41,7 +41,7 @@ endef
include $(MESA_TOP)/src/mesa/Makefile.sources
include $(CLEAR_VARS)
-
+LOCAL_CFLAGS += -Wno-error
LOCAL_MODULE := libmesa_program
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_STATIC_LIBRARIES := libmesa_nir \
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 2908819d28e..1af7921ec32 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2511,8 +2511,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_context *ctx,
void
_mesa_associate_uniform_storage(struct gl_context *ctx,
struct gl_shader_program *shader_program,
- struct gl_program *prog,
- bool propagate_to_storage)
+ struct gl_program *prog)
{
struct gl_program_parameter_list *params = prog->Parameters;
gl_shader_stage shader_type = prog->info.stage;
@@ -2638,26 +2637,24 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
* data from the linker's backing store. This will cause values from
* initializers in the source code to be copied over.
*/
- if (propagate_to_storage) {
- unsigned array_elements = MAX2(1, storage->array_elements);
- if (ctx->Const.PackedDriverUniformStorage && !prog->is_arb_asm &&
- (storage->is_bindless || !storage->type->contains_opaque())) {
- const int dmul = storage->type->is_64bit() ? 2 : 1;
- const unsigned components =
- storage->type->vector_elements *
- storage->type->matrix_columns;
-
- for (unsigned s = 0; s < storage->num_driver_storage; s++) {
- gl_constant_value *uni_storage = (gl_constant_value *)
- storage->driver_storage[s].data;
- memcpy(uni_storage, storage->storage,
- sizeof(storage->storage[0]) * components *
- array_elements * dmul);
- }
- } else {
- _mesa_propagate_uniforms_to_driver_storage(storage, 0,
- array_elements);
+ unsigned array_elements = MAX2(1, storage->array_elements);
+ if (ctx->Const.PackedDriverUniformStorage && !prog->is_arb_asm &&
+ (storage->is_bindless || !storage->type->contains_opaque())) {
+ const int dmul = storage->type->is_64bit() ? 2 : 1;
+ const unsigned components =
+ storage->type->vector_elements *
+ storage->type->matrix_columns;
+
+ for (unsigned s = 0; s < storage->num_driver_storage; s++) {
+ gl_constant_value *uni_storage = (gl_constant_value *)
+ storage->driver_storage[s].data;
+ memcpy(uni_storage, storage->storage,
+ sizeof(storage->storage[0]) * components *
+ array_elements * dmul);
}
+ } else {
+ _mesa_propagate_uniforms_to_driver_storage(storage, 0,
+ array_elements);
}
last_location = location;
@@ -3016,7 +3013,7 @@ get_mesa_program(struct gl_context *ctx,
* prog->ParameterValues to get reallocated (e.g., anything that adds a
* program constant) has to happen before creating this linkage.
*/
- _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
+ _mesa_associate_uniform_storage(ctx, shader_program, prog);
if (!shader_program->data->LinkStatus) {
goto fail_exit;
}
diff --git a/src/mesa/program/ir_to_mesa.h b/src/mesa/program/ir_to_mesa.h
index f5665e6316e..33eb801bae8 100644
--- a/src/mesa/program/ir_to_mesa.h
+++ b/src/mesa/program/ir_to_mesa.h
@@ -50,8 +50,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_context *ctx,
void
_mesa_associate_uniform_storage(struct gl_context *ctx,
struct gl_shader_program *shader_program,
- struct gl_program *prog,
- bool propagate_to_storage);
+ struct gl_program *prog);
#ifdef __cplusplus
}
diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 2bc1b6db6eb..4073030f536 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -271,6 +271,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList,
p->Name = strdup(name ? name : "");
p->Type = type;
p->Size = size;
+ p->Padded = pad_and_align;
p->DataType = datatype;
paramList->ParameterValueOffset[oldNum] = oldValNum;
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index cc551c18910..d3d5961f920 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -104,6 +104,12 @@ struct gl_program_parameter
* A sequence of STATE_* tokens and integers to identify GL state.
*/
gl_state_index16 StateIndexes[STATE_LENGTH];
+
+ /**
+ * We need to keep track of whether the param is padded for use in the
+ * shader cache.
+ */
+ bool Padded;
};
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index fecaaf77da8..c54b50dc754 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -208,6 +208,10 @@ new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw)
rs->prim.end = 1;
rs->prim.start = 0;
rs->prim.count = 1;
+ rs->prim.pad = 0;
+ rs->prim.num_instances = 1;
+ rs->prim.base_instance = 0;
+ rs->prim.is_indirect = 0;
return rs;
}
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 0a0bd8ba1ca..57f76fc1ce0 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -479,7 +479,7 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET));
/* GL limits and extensions */
- st_init_limits(pipe->screen, &ctx->Const, &ctx->Extensions, ctx->API);
+ st_init_limits(pipe->screen, &ctx->Const, &ctx->Extensions);
st_init_extensions(pipe->screen, &ctx->Const,
&ctx->Extensions, &st->options, ctx->API);
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 1e456d019d0..dd0320e5b9b 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -76,8 +76,7 @@ static int _clamp(int a, int min, int max)
* Note that we have to limit/clamp against Mesa's internal limits too.
*/
void st_init_limits(struct pipe_screen *screen,
- struct gl_constants *c, struct gl_extensions *extensions,
- gl_api api)
+ struct gl_constants *c, struct gl_extensions *extensions)
{
int supported_irs;
unsigned sh;
@@ -223,8 +222,13 @@ void st_init_limits(struct pipe_screen *screen,
pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents,
MAX_UNIFORMS * 4);
+ /* For ARB programs, prog_src_register::Index is a signed 13-bit number.
+ * This gives us a limit of 4096 values - but we may need to generate
+ * internal values in addition to what the source program uses. So, we
+ * drop the limit one step lower, to 2048, to be safe.
+ */
pc->MaxParameters =
- pc->MaxNativeParameters = pc->MaxUniformComponents / 4;
+ pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048);
pc->MaxInputComponents =
screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4;
pc->MaxOutputComponents =
@@ -362,10 +366,7 @@ void st_init_limits(struct pipe_screen *screen,
c->Program[MESA_SHADER_VERTEX].MaxAttribs =
MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16);
- /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number
- * of inputs. It's always 2 colors + N generic inputs. */
- c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
- PIPE_SHADER_CAP_MAX_INPUTS);
+ c->MaxVarying = screen->get_param(screen, PIPE_CAP_MAX_VARYINGS);
c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING);
c->MaxGeometryOutputVertices =
screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES);
@@ -438,14 +439,8 @@ void st_init_limits(struct pipe_screen *screen,
c->GLSLFrontFacingIsSysVal =
screen->get_param(screen, PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL);
- /* GL_ARB_get_program_binary
- *
- * The QT framework has a bug in their shader program cache, which is built
- * on GL_ARB_get_program_binary. In an effort to allow them to fix the bug
- * we don't enable more than 1 binary format for compatibility profiles.
- */
- if (api != API_OPENGL_COMPAT &&
- screen->get_disk_shader_cache && screen->get_disk_shader_cache(screen))
+ /* GL_ARB_get_program_binary */
+ if (screen->get_disk_shader_cache && screen->get_disk_shader_cache(screen))
c->NumProgramBinaryFormats = 1;
c->MaxAtomicBufferBindings =
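
A quick sanity check of the arithmetic in the new comment above: a signed 13-bit field covers -4096..4095, so only 4096 distinct non-negative indices exist, and the clamp halves that to leave room for internally generated values. A sketch:

   #include <assert.h>

   int main(void)
   {
      int bits = 13;
      int min = -(1 << (bits - 1));      /* -4096 */
      int max = (1 << (bits - 1)) - 1;   /*  4095 */
      assert(min == -4096 && max == 4095);
      assert((max + 1) / 2 == 2048);     /* the chosen MaxParameters cap */
      return 0;
   }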
diff --git a/src/mesa/state_tracker/st_extensions.h b/src/mesa/state_tracker/st_extensions.h
index fdfac7ece70..7bf1aa8c8cb 100644
--- a/src/mesa/state_tracker/st_extensions.h
+++ b/src/mesa/state_tracker/st_extensions.h
@@ -35,8 +35,7 @@ struct pipe_screen;
extern void st_init_limits(struct pipe_screen *screen,
struct gl_constants *c,
- struct gl_extensions *extensions,
- gl_api api);
+ struct gl_extensions *extensions);
extern void st_init_extensions(struct pipe_screen *screen,
struct gl_constants *consts,
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index aacb8788287..febde1a5e97 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -2356,6 +2356,8 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
bindings |= PIPE_BIND_DEPTH_STENCIL;
else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 ||
internalFormat == GL_RGB || internalFormat == GL_RGBA ||
+ internalFormat == GL_RGBA2 ||
+ internalFormat == GL_RGB4 || internalFormat == GL_RGBA4 ||
internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 ||
internalFormat == GL_BGRA ||
internalFormat == GL_RGB16F ||
diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp
index d7f2e3e6eaa..88506715e38 100644
--- a/src/mesa/state_tracker/st_glsl_to_nir.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp
@@ -327,7 +327,7 @@ st_nir_opts(nir_shader *nir, bool scalar)
NIR_PASS(progress, nir, nir_opt_if);
NIR_PASS(progress, nir, nir_opt_dead_cf);
NIR_PASS(progress, nir, nir_opt_cse);
- NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_opt_constant_folding);
@@ -456,7 +456,7 @@ st_glsl_to_nir_post_opts(struct st_context *st, struct gl_program *prog,
* prog->ParameterValues to get reallocated (e.g., anything that adds a
* program constant) has to happen before creating this linkage.
*/
- _mesa_associate_uniform_storage(st->ctx, shader_program, prog, true);
+ _mesa_associate_uniform_storage(st->ctx, shader_program, prog);
st_set_prog_affected_state_flags(prog);
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 2102b7a57d5..060648d76a2 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -7201,7 +7201,7 @@ get_mesa_program_tgsi(struct gl_context *ctx,
* prog->ParameterValues to get reallocated (e.g., anything that adds a
* program constant) has to happen before creating this linkage.
*/
- _mesa_associate_uniform_storage(ctx, shader_program, prog, true);
+ _mesa_associate_uniform_storage(ctx, shader_program, prog);
if (!shader_program->data->LinkStatus) {
free_glsl_to_tgsi_visitor(v);
_mesa_reference_program(ctx, &shader->Program, NULL);
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 5efbd1fa1d2..d16a4c1df40 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -1105,10 +1105,17 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi,
else {
GET_CURRENT_CONTEXT(ctx);
- ret = _mesa_make_current(NULL, NULL, NULL);
-
- if (ctx)
+ if (ctx) {
+ /* Before releasing the context, release its associated
+ * winsys buffers first. Then purge the context's winsys buffers list
+ * to free the resources of any winsys buffers that no longer have
+ * an existing drawable.
+ */
+ ret = _mesa_make_current(ctx, NULL, NULL);
st_framebuffers_purge(ctx->st);
+ }
+
+ ret = _mesa_make_current(NULL, NULL, NULL);
}
return ret;
@@ -1252,7 +1259,7 @@ get_version(struct pipe_screen *screen,
_mesa_init_constants(&consts, api);
_mesa_init_extensions(&extensions);
- st_init_limits(screen, &consts, &extensions, api);
+ st_init_limits(screen, &consts, &extensions);
st_init_extensions(screen, &consts, &extensions, options, api);
return _mesa_get_version(&extensions, &consts, api);
diff --git a/src/mesa/state_tracker/st_shader_cache.c b/src/mesa/state_tracker/st_shader_cache.c
index c82ce3eaa2d..894816ab16f 100644
--- a/src/mesa/state_tracker/st_shader_cache.c
+++ b/src/mesa/state_tracker/st_shader_cache.c
@@ -365,7 +365,7 @@ st_deserialise_ir_program(struct gl_context *ctx,
}
st_set_prog_affected_state_flags(prog);
- _mesa_associate_uniform_storage(ctx, shProg, prog, false);
+ _mesa_associate_uniform_storage(ctx, shProg, prog);
/* Create Gallium shaders now instead of on demand. */
if (ST_DEBUG & DEBUG_PRECOMPILE ||
diff --git a/src/mesa/state_tracker/st_tgsi_lower_yuv.c b/src/mesa/state_tracker/st_tgsi_lower_yuv.c
index 6acd173adc9..73437ddda70 100644
--- a/src/mesa/state_tracker/st_tgsi_lower_yuv.c
+++ b/src/mesa/state_tracker/st_tgsi_lower_yuv.c
@@ -269,31 +269,39 @@ yuv_to_rgb(struct tgsi_transform_context *tctx,
tctx->emit_instruction(tctx, &inst);
/* DP3 dst.x, tmpA, imm[0] */
- inst = dp3_instruction();
- reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X);
- reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
- reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W));
- tctx->emit_instruction(tctx, &inst);
+ if (dst->Register.WriteMask & TGSI_WRITEMASK_X) {
+ inst = dp3_instruction();
+ reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X);
+ reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
+ reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W));
+ tctx->emit_instruction(tctx, &inst);
+ }
/* DP3 dst.y, tmpA, imm[1] */
- inst = dp3_instruction();
- reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y);
- reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
- reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W));
- tctx->emit_instruction(tctx, &inst);
+ if (dst->Register.WriteMask & TGSI_WRITEMASK_Y) {
+ inst = dp3_instruction();
+ reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Y);
+ reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
+ reg_src(&inst.Src[1], &ctx->imm[1], SWIZ(X, Y, Z, W));
+ tctx->emit_instruction(tctx, &inst);
+ }
/* DP3 dst.z, tmpA, imm[2] */
- inst = dp3_instruction();
- reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z);
- reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
- reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W));
- tctx->emit_instruction(tctx, &inst);
+ if (dst->Register.WriteMask & TGSI_WRITEMASK_Z) {
+ inst = dp3_instruction();
+ reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_Z);
+ reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
+ reg_src(&inst.Src[1], &ctx->imm[2], SWIZ(X, Y, Z, W));
+ tctx->emit_instruction(tctx, &inst);
+ }
   /* MOV dst.w, imm[3].w */
- inst = mov_instruction();
- reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W);
- reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W));
- tctx->emit_instruction(tctx, &inst);
+ if (dst->Register.WriteMask & TGSI_WRITEMASK_W) {
+ inst = mov_instruction();
+ reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_W);
+ reg_src(&inst.Src[0], &ctx->imm[3], SWIZ(_, _, _, W));
+ tctx->emit_instruction(tctx, &inst);
+ }
}
static void
@@ -434,7 +442,7 @@ st_tgsi_lower_yuv(const struct tgsi_token *tokens, unsigned free_slots,
   /* TODO: do a better job of figuring out how many extra tokens we need;
    * this is a pain point of tgsi_transform :-/
    */
- newlen = tgsi_num_tokens(tokens) + 120;
+ newlen = tgsi_num_tokens(tokens) + 300;
newtoks = tgsi_alloc_tokens(newlen);
if (!newtoks)
return NULL;
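
The four guards added above all follow the same shape; a hypothetical restatement as a loop (emit_channel() is an invented helper standing in for the per-channel DP3/MOV emission, not part of the patch):

   static const unsigned write_masks[4] = {
      TGSI_WRITEMASK_X, TGSI_WRITEMASK_Y, TGSI_WRITEMASK_Z, TGSI_WRITEMASK_W
   };

   for (unsigned chan = 0; chan < 4; chan++) {
      if (dst->Register.WriteMask & write_masks[chan])
         emit_channel(tctx, ctx, dst, chan);
   }

Skipping channels the destination never writes avoids clobbering components produced by other instructions.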
diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf
index cb0e6e659e2..c38334140b6 100644
--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@@ (four hunks, at lines ~98, ~229, ~346, and ~423) @@ TODO: document the other workarounds.
[Placeholder: the driconf workaround entries these hunks add to 00-mesa-defaults.conf were stripped during extraction, leaving only bare "+" lines; the original entries are not recoverable here.]
diff --git a/src/util/Android.mk b/src/util/Android.mk
index 2d59e1ae15e..6d770ca9575 100644
--- a/src/util/Android.mk
+++ b/src/util/Android.mk
@@ -41,8 +41,14 @@ LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/gallium/include \
$(MESA_TOP)/src/gallium/auxiliary
+# If Android version >= 8, Mesa should statically link libexpat; otherwise it should dynamically link it
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+LOCAL_STATIC_LIBRARIES := \
+ libexpat
+else
LOCAL_SHARED_LIBRARIES := \
libexpat
+endif
LOCAL_MODULE := libmesa_util
diff --git a/src/util/bitscan.h b/src/util/bitscan.h
index dc89ac93f28..cdfecafaf01 100644
--- a/src/util/bitscan.h
+++ b/src/util/bitscan.h
@@ -112,6 +112,31 @@ u_bit_scan64(uint64_t *mask)
return i;
}
+/* Count bits set in mask (SWAR popcount; see Hacker's Delight, ch. 5) */
+static inline int
+u_count_bits(unsigned *mask)
+{
+   unsigned v = *mask;
+   int c;
+   v = v - ((v >> 1) & 0x55555555);                /* 2-bit partial sums */
+   v = (v & 0x33333333) + ((v >> 2) & 0x33333333); /* 4-bit partial sums */
+   v = (v + (v >> 4)) & 0x0F0F0F0F;                /* 8-bit partial sums */
+   c = (int)((v * 0x1010101) >> 24);               /* add up all bytes */
+   return c;
+}
+
+/* 64-bit variant of the same byte-wise population count */
+static inline int
+u_count_bits64(uint64_t *mask)
+{
+   uint64_t v = *mask;
+   int c;
+   v = v - ((v >> 1) & 0x5555555555555555ull);
+   v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
+   v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;
+   c = (int)((v * 0x101010101010101ull) >> 56);
+   return c;
+}
+
/* Determine if an unsigned value is a power of two.
*
* \note
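
The two helpers above are the classic SWAR population count. A hypothetical standalone check against the GCC/Clang builtins (assumes util/bitscan.h is reachable on the include path):

   #include <assert.h>
   #include <stdint.h>
   #include "util/bitscan.h"

   int main(void)
   {
      unsigned m = 0xF00Fu;                   /* 8 bits set */
      uint64_t m64 = 0xF00FF00FF00FF00Full;   /* 32 bits set */
      assert(u_count_bits(&m) == __builtin_popcount(m));
      assert(u_count_bits64(&m64) == __builtin_popcountll(m64));
      return 0;
   }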
diff --git a/src/util/bitset.h b/src/util/bitset.h
index 3b18abac793..7ccfffad45f 100644
--- a/src/util/bitset.h
+++ b/src/util/bitset.h
@@ -62,8 +62,8 @@
#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
-#define BITSET_MASK(b) ((b) == BITSET_WORDBITS ? ~0 : BITSET_BIT(b) - 1)
-#define BITSET_RANGE(b, e) (BITSET_MASK((e) + 1) & ~BITSET_MASK(b))
+#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
+#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
/* bit range operations
*/
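
The BITSET_MASK fix above matters for multi-word bitsets: the old macro only special-cased b == BITSET_WORDBITS, so a range filling any later word evaluated to an empty mask. A hypothetical check, assuming the usual 32-bit BITSET_WORD:

   #include <assert.h>
   #include "util/bitset.h"   /* assumed include path */

   int main(void)
   {
      /* bits 4..7, entirely inside the first word */
      assert(BITSET_RANGE(4, 7) == 0xF0u);
      /* a range filling the second word exactly: previously BITSET_MASK(64)
       * evaluated to BITSET_BIT(64) - 1 == 0, yielding an empty range */
      assert(BITSET_RANGE(32, 63) == 0xFFFFFFFFu);
      return 0;
   }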
diff --git a/src/util/merge_driinfo.py b/src/util/merge_driinfo.py
index e6ccca5e0f3..a09218a3902 100644
--- a/src/util/merge_driinfo.py
+++ b/src/util/merge_driinfo.py
@@ -52,6 +52,14 @@ def __init__(self, name, defaults):
self.defaults = defaults
+class Verbatim(object):
+ """
+ Represent a chunk of code that is copied into the result file verbatim.
+ """
+ def __init__(self):
+ self.string = ''
+
+
class Section(object):
"""
Represent a config section description as:
@@ -75,8 +83,29 @@ def parse_inputs(input_filenames):
section = None
linenum = 0
+ verbatim = None
for line in infile:
linenum += 1
+
+ if line.startswith('//= BEGIN VERBATIM'):
+ if verbatim is not None:
+ print('{}:{}: nested verbatim'
+ .format(input_filename, linenum))
+ success = False
+ continue
+ verbatim = Verbatim()
+
+ if verbatim is not None:
+ verbatim.string += line
+
+ if line.startswith('//= END VERBATIM'):
+ if section is None:
+ sections.append(verbatim)
+ else:
+ section.options.append(verbatim)
+ verbatim = None
+ continue
+
line = line.strip()
if not line:
continue
@@ -144,12 +173,17 @@ def merge_sections(section_list):
assert section.name == merged_section.name
for orig_option in section.options:
- for merged_option in merged_section.options:
- if orig_option.name == merged_option.name:
- merged_option.defaults = orig_option.defaults
- break
+ if isinstance(orig_option, Option):
+ for merged_option in merged_section.options:
+ if not isinstance(merged_option, Option):
+ continue
+ if orig_option.name == merged_option.name:
+ merged_option.defaults = orig_option.defaults
+ break
+ else:
+ merged_section.options.append(Option(orig_option.name, orig_option.defaults))
else:
- merged_section.options.append(Option(orig_option.name, orig_option.defaults))
+ merged_section.options.append(orig_option)
return merged_section
@@ -164,6 +198,10 @@ def merge_sections_lists(sections_lists):
for idx,sections in enumerate(sections_lists):
for base_section in sections:
+ if not isinstance(base_section, Section):
+ merged_sections.append(base_section)
+ continue
+
original_sections = [base_section]
for next_sections in sections_lists[idx+1:]:
for j,section in enumerate(next_sections):
@@ -201,15 +239,23 @@ def main(input_filenames):
DRI_CONF_BEGIN
% for section in sections:
+% if isinstance(section, Section):
DRI_CONF_SECTION_${section.name}
% for option in section.options:
+% if isinstance(option, Option):
DRI_CONF_${option.name}(${option.defaults})
+% else:
+${option.string}
+% endif
% endfor
DRI_CONF_SECTION_END
+% else:
+${section.string}
+% endif
% endfor
DRI_CONF_END""")
- print(driinfo_h_template.render(sections=merged_sections_list))
+ print(driinfo_h_template.render(sections=merged_sections_list, Section=Section, Option=Option))
return True
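
To illustrate the markers the script now understands: everything between the BEGIN and END lines (markers included) is carried into the merged driinfo header untouched, instead of being parsed into sections and options. A hypothetical input fragment (the guard and option name are invented):

   //= BEGIN VERBATIM
   #ifdef HAVE_EXAMPLE_FEATURE
   DRI_CONF_SECTION_DEBUG
      DRI_CONF_EXAMPLE_OPTION("false")
   DRI_CONF_SECTION_END
   #endif
   //= END VERBATIM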
diff --git a/src/util/xmlpool/t_options.h b/src/util/xmlpool/t_options.h
index 80ddf0e203e..dd2b5c21760 100644
--- a/src/util/xmlpool/t_options.h
+++ b/src/util/xmlpool/t_options.h
@@ -338,12 +338,8 @@ DRI_CONF_OPT_BEGIN_B(radeonsi_commutative_blend_add, def) \
DRI_CONF_DESC(en,gettext("Commutative additive blending optimizations (may cause rendering errors)")) \
DRI_CONF_OPT_END
-#define DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR(def) \
-DRI_CONF_OPT_BEGIN_B(radeonsi_clear_db_cache_before_clear, def) \
- DRI_CONF_DESC(en,"Clear DB cache before fast depth clear") \
-DRI_CONF_OPT_END
-
#define DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS(def) \
DRI_CONF_OPT_BEGIN_B(radeonsi_zerovram, def) \
DRI_CONF_DESC(en,"Zero all vram allocations") \
DRI_CONF_OPT_END
+
diff --git a/src/vulkan/Android.mk b/src/vulkan/Android.mk
index 6253f1c3be9..99dd3f56313 100644
--- a/src/vulkan/Android.mk
+++ b/src/vulkan/Android.mk
@@ -39,6 +39,14 @@ LOCAL_C_INCLUDES := \
$(MESA_TOP)/include/vulkan \
$(MESA_TOP)/src/vulkan/util
+ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0)
+LOCAL_C_INCLUDES += \
+ frameworks/native/libs/nativebase/include \
+ frameworks/native/libs/nativewindow/include \
+ frameworks/native/libs/arect/include
+LOCAL_HEADER_LIBRARIES += libcutils_headers libsystem_headers
+endif
+
LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, \
$(VULKAN_UTIL_GENERATED_FILES))
diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build
index 59e1fd1fabe..d123750c73c 100644
--- a/src/vulkan/meson.build
+++ b/src/vulkan/meson.build
@@ -20,7 +20,6 @@
vk_api_xml = files('registry/vk.xml')
-inc_vulkan_util = include_directories('util')
inc_vulkan_wsi = include_directories('wsi')
subdir('util')
diff --git a/src/vulkan/util/meson.build b/src/vulkan/util/meson.build
index 15e4ff49129..e869056acd2 100644
--- a/src/vulkan/util/meson.build
+++ b/src/vulkan/util/meson.build
@@ -43,3 +43,15 @@ libvulkan_util = static_library(
c_args : [c_vis_args],
build_by_default : false,
)
+
+idep_vulkan_util_headers = declare_dependency(
+ sources : vk_enum_to_str[1],
+ include_directories : include_directories('.')
+)
+
+idep_vulkan_util = declare_dependency(
+ sources : vk_enum_to_str[1],
+ link_with : libvulkan_util,
+ include_directories : include_directories('.'),
+ dependencies : idep_vulkan_util_headers
+)
diff --git a/src/vulkan/wsi/meson.build b/src/vulkan/wsi/meson.build
index e9812b663e4..106509502ab 100644
--- a/src/vulkan/wsi/meson.build
+++ b/src/vulkan/wsi/meson.build
@@ -62,8 +62,8 @@ endif
libvulkan_wsi = static_library(
'vulkan_wsi',
files_vulkan_wsi,
- include_directories : [inc_common, inc_vulkan_util, inc_drm_uapi],
- dependencies : [vulkan_wsi_deps, dep_libdrm],
+ include_directories : [inc_common, inc_drm_uapi],
+ dependencies : [vulkan_wsi_deps, dep_libdrm, idep_vulkan_util],
c_args : [c_vis_args, vulkan_wsi_args],
build_by_default : false,
)
diff --git a/src/vulkan/wsi/wsi_common_display.c b/src/vulkan/wsi/wsi_common_display.c
index 20209b0cbd0..ccde1e0e679 100644
--- a/src/vulkan/wsi/wsi_common_display.c
+++ b/src/vulkan/wsi/wsi_common_display.c
@@ -960,8 +960,8 @@ static void
wsi_display_destroy_buffer(struct wsi_display *wsi,
uint32_t buffer)
{
- (void) drmIoctl(wsi->fd, DRM_IOCTL_MODE_DESTROY_DUMB,
- &((struct drm_mode_destroy_dumb) { .handle = buffer }));
+ (void) drmIoctl(wsi->fd, DRM_IOCTL_GEM_CLOSE,
+ &((struct drm_gem_close) { .handle = buffer }));
}
static VkResult
@@ -1798,6 +1798,30 @@ wsi_init_pthread_cond_monotonic(pthread_cond_t *cond)
return ret;
}
+
+/*
+ * Local version of the libdrm helper. Added to avoid depending on a
+ * bleeding-edge version of the library.
+ */
+static int
+local_drmIsMaster(int fd)
+{
+   /* Detect master by attempting something that requires master.
+    *
+    * Authenticating magic tokens requires master, and 0 is an
+    * internal kernel detail we can exploit: attempting it on a
+    * master fd therefore fails with EINVAL, because 0 is not a
+    * valid magic token.
+    *
+    * A non-master fd will instead fail with EACCES, as the kernel
+    * checks for master before attempting anything else.
+    *
+    * Since we don't want to rely on implementation details beyond
+    * that, treat any result other than EACCES as "is master".
+    */
+ return drmAuthMagic(fd, 0) != -EACCES;
+}
+
VkResult
wsi_display_init_wsi(struct wsi_device *wsi_device,
const VkAllocationCallbacks *alloc,
@@ -1813,6 +1837,9 @@ wsi_display_init_wsi(struct wsi_device *wsi_device,
}
wsi->fd = display_fd;
+ if (wsi->fd != -1 && !local_drmIsMaster(wsi->fd))
+ wsi->fd = -1;
+
wsi->alloc = alloc;
list_inithead(&wsi->connectors);
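
For the master check added above: drmAuthMagic(fd, 0) fails with -EINVAL on a master fd (0 is never a valid magic token) but with -EACCES on a non-master fd, since the kernel rejects non-masters before looking at the token. A hypothetical standalone caller (assumes an already-open DRM fd):

   #include <xf86drm.h>
   #include <stdio.h>

   static void report_master(int fd)
   {
      /* anything other than -EACCES means the master check passed */
      if (drmAuthMagic(fd, 0) != -EACCES)
         printf("fd %d holds DRM master\n", fd);
      else
         printf("fd %d is not master; modesetting will fail\n", fd);
   }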
diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c
index c0132566ead..82139de31d9 100644
--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@@ -701,9 +701,14 @@ wsi_wl_swapchain_acquire_next_image(struct wsi_swapchain *wsi_chain,
}
}
- /* This time we do a blocking dispatch because we can't go
- * anywhere until we get an event.
+   /* We now have to do a blocking dispatch, because all our images
+    * are in use and we cannot return one until the server releases one.
+    * However, if the client has requested a non-blocking acquire
+    * (timeout of zero), tell it up front that we have nothing to return.
*/
+ if (info->timeout == 0)
+ return VK_NOT_READY;
+
int ret = wl_display_roundtrip_queue(chain->display->wl_display,
chain->display->queue);
if (ret < 0)